diff --git "a/checkpoint-13540/trainer_state.json" "b/checkpoint-13540/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-13540/trainer_state.json" @@ -0,0 +1,94813 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.998277135121831, + "eval_steps": 500, + "global_step": 13540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00036918533103618014, + "grad_norm": 2.4412319660186768, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.1388, + "step": 1 + }, + { + "epoch": 0.0007383706620723603, + "grad_norm": 2.275918483734131, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.1206, + "step": 2 + }, + { + "epoch": 0.0011075559931085406, + "grad_norm": 2.2667036056518555, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0982, + "step": 3 + }, + { + "epoch": 0.0014767413241447206, + "grad_norm": 2.4527230262756348, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.1072, + "step": 4 + }, + { + "epoch": 0.0018459266551809008, + "grad_norm": 2.393638849258423, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.0996, + "step": 5 + }, + { + "epoch": 0.002215111986217081, + "grad_norm": 2.389622449874878, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.0901, + "step": 6 + }, + { + "epoch": 0.002584297317253261, + "grad_norm": 2.34468150138855, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.1209, + "step": 7 + }, + { + "epoch": 0.002953482648289441, + "grad_norm": 2.48166561126709, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.1034, + "step": 8 + }, + { + "epoch": 0.0033226679793256215, + "grad_norm": 2.1924567222595215, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.1516, + "step": 9 + }, + { + "epoch": 0.0036918533103618015, + "grad_norm": 2.408474922180176, + "learning_rate": 5.000000000000001e-07, + "loss": 1.1405, + "step": 10 + }, + { + "epoch": 0.004061038641397982, + "grad_norm": 2.1132235527038574, + "learning_rate": 5.5e-07, + "loss": 1.1031, + "step": 11 + }, + { + "epoch": 0.004430223972434162, + "grad_norm": 2.1916306018829346, + "learning_rate": 6.000000000000001e-07, + "loss": 1.1238, + "step": 12 + }, + { + "epoch": 0.004799409303470342, + "grad_norm": 2.040755033493042, + "learning_rate": 6.5e-07, + "loss": 1.1197, + "step": 13 + }, + { + "epoch": 0.005168594634506522, + "grad_norm": 1.9937278032302856, + "learning_rate": 7.000000000000001e-07, + "loss": 1.0578, + "step": 14 + }, + { + "epoch": 0.005537779965542703, + "grad_norm": 2.0224194526672363, + "learning_rate": 7.5e-07, + "loss": 1.0846, + "step": 15 + }, + { + "epoch": 0.005906965296578882, + "grad_norm": 1.80266273021698, + "learning_rate": 8.000000000000001e-07, + "loss": 1.0908, + "step": 16 + }, + { + "epoch": 0.006276150627615063, + "grad_norm": 1.8800767660140991, + "learning_rate": 8.500000000000001e-07, + "loss": 1.0928, + "step": 17 + }, + { + "epoch": 0.006645335958651243, + "grad_norm": 1.6004233360290527, + "learning_rate": 9.000000000000001e-07, + "loss": 1.0828, + "step": 18 + }, + { + "epoch": 0.0070145212896874235, + "grad_norm": 1.653378963470459, + "learning_rate": 9.500000000000001e-07, + "loss": 1.1113, + "step": 19 + }, + { + "epoch": 0.007383706620723603, + "grad_norm": 1.6406723260879517, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0682, + "step": 20 + }, + { + "epoch": 0.0077528919517597834, + "grad_norm": 1.6181585788726807, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.1055, + "step": 21 + }, + { + "epoch": 0.008122077282795964, + "grad_norm": 1.551200270652771, + "learning_rate": 1.1e-06, + "loss": 1.0795, + "step": 22 + }, + { + "epoch": 0.008491262613832144, + "grad_norm": 1.3663794994354248, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.0283, + "step": 23 + }, + { + "epoch": 0.008860447944868325, + "grad_norm": 1.3748595714569092, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.0784, + "step": 24 + }, + { + "epoch": 0.009229633275904503, + "grad_norm": 1.2911611795425415, + "learning_rate": 1.25e-06, + "loss": 0.9854, + "step": 25 + }, + { + "epoch": 0.009598818606940684, + "grad_norm": 1.3047049045562744, + "learning_rate": 1.3e-06, + "loss": 1.0503, + "step": 26 + }, + { + "epoch": 0.009968003937976864, + "grad_norm": 1.17142653465271, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.0588, + "step": 27 + }, + { + "epoch": 0.010337189269013045, + "grad_norm": 1.2030054330825806, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.0328, + "step": 28 + }, + { + "epoch": 0.010706374600049225, + "grad_norm": 1.131135106086731, + "learning_rate": 1.45e-06, + "loss": 1.0273, + "step": 29 + }, + { + "epoch": 0.011075559931085405, + "grad_norm": 1.0866118669509888, + "learning_rate": 1.5e-06, + "loss": 0.9883, + "step": 30 + }, + { + "epoch": 0.011444745262121586, + "grad_norm": 1.0986360311508179, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.0138, + "step": 31 + }, + { + "epoch": 0.011813930593157764, + "grad_norm": 0.9595009088516235, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.988, + "step": 32 + }, + { + "epoch": 0.012183115924193945, + "grad_norm": 1.054680347442627, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.0522, + "step": 33 + }, + { + "epoch": 0.012552301255230125, + "grad_norm": 0.9745041131973267, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.9978, + "step": 34 + }, + { + "epoch": 0.012921486586266306, + "grad_norm": 0.9892019629478455, + "learning_rate": 1.75e-06, + "loss": 1.046, + "step": 35 + }, + { + "epoch": 0.013290671917302486, + "grad_norm": 0.9731583595275879, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.9907, + "step": 36 + }, + { + "epoch": 0.013659857248338667, + "grad_norm": 0.9275212287902832, + "learning_rate": 1.85e-06, + "loss": 0.9293, + "step": 37 + }, + { + "epoch": 0.014029042579374847, + "grad_norm": 0.9423267245292664, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.9377, + "step": 38 + }, + { + "epoch": 0.014398227910411026, + "grad_norm": 0.9474686980247498, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.9594, + "step": 39 + }, + { + "epoch": 0.014767413241447206, + "grad_norm": 0.9592716693878174, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.9871, + "step": 40 + }, + { + "epoch": 0.015136598572483386, + "grad_norm": 0.9387710094451904, + "learning_rate": 2.05e-06, + "loss": 0.9823, + "step": 41 + }, + { + "epoch": 0.015505783903519567, + "grad_norm": 0.9290558695793152, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9354, + "step": 42 + }, + { + "epoch": 0.015874969234555746, + "grad_norm": 0.9229576587677002, + "learning_rate": 2.15e-06, + "loss": 0.9828, + "step": 43 + }, + { + "epoch": 0.016244154565591928, + "grad_norm": 0.9243917465209961, + "learning_rate": 2.2e-06, + "loss": 1.0032, + "step": 44 + }, + { + "epoch": 0.016613339896628106, + "grad_norm": 0.9277251958847046, + "learning_rate": 2.25e-06, + "loss": 0.9702, + "step": 45 + }, + { + "epoch": 0.01698252522766429, + "grad_norm": 0.9161118865013123, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.9846, + "step": 46 + }, + { + "epoch": 0.017351710558700467, + "grad_norm": 0.8675879240036011, + "learning_rate": 2.35e-06, + "loss": 0.9333, + "step": 47 + }, + { + "epoch": 0.01772089588973665, + "grad_norm": 0.8792003393173218, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0106, + "step": 48 + }, + { + "epoch": 0.018090081220772828, + "grad_norm": 0.894873857498169, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.9514, + "step": 49 + }, + { + "epoch": 0.018459266551809007, + "grad_norm": 0.8866007328033447, + "learning_rate": 2.5e-06, + "loss": 1.0057, + "step": 50 + }, + { + "epoch": 0.01882845188284519, + "grad_norm": 0.8863010406494141, + "learning_rate": 2.55e-06, + "loss": 0.9586, + "step": 51 + }, + { + "epoch": 0.019197637213881368, + "grad_norm": 0.912958025932312, + "learning_rate": 2.6e-06, + "loss": 0.9121, + "step": 52 + }, + { + "epoch": 0.01956682254491755, + "grad_norm": 0.8827121257781982, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.9517, + "step": 53 + }, + { + "epoch": 0.01993600787595373, + "grad_norm": 0.8558551669120789, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9365, + "step": 54 + }, + { + "epoch": 0.02030519320698991, + "grad_norm": 0.8515662550926208, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.9204, + "step": 55 + }, + { + "epoch": 0.02067437853802609, + "grad_norm": 0.844958484172821, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9801, + "step": 56 + }, + { + "epoch": 0.021043563869062268, + "grad_norm": 0.8569675087928772, + "learning_rate": 2.85e-06, + "loss": 0.9336, + "step": 57 + }, + { + "epoch": 0.02141274920009845, + "grad_norm": 0.8839316964149475, + "learning_rate": 2.9e-06, + "loss": 0.9296, + "step": 58 + }, + { + "epoch": 0.02178193453113463, + "grad_norm": 1.0445549488067627, + "learning_rate": 2.95e-06, + "loss": 0.9134, + "step": 59 + }, + { + "epoch": 0.02215111986217081, + "grad_norm": 0.8282931447029114, + "learning_rate": 3e-06, + "loss": 0.9421, + "step": 60 + }, + { + "epoch": 0.02252030519320699, + "grad_norm": 0.8246078491210938, + "learning_rate": 3.05e-06, + "loss": 0.8743, + "step": 61 + }, + { + "epoch": 0.02288949052424317, + "grad_norm": 0.8587180376052856, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.9303, + "step": 62 + }, + { + "epoch": 0.02325867585527935, + "grad_norm": 0.8575277924537659, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9503, + "step": 63 + }, + { + "epoch": 0.02362786118631553, + "grad_norm": 0.8518301844596863, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.9225, + "step": 64 + }, + { + "epoch": 0.02399704651735171, + "grad_norm": 0.8324997425079346, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9158, + "step": 65 + }, + { + "epoch": 0.02436623184838789, + "grad_norm": 0.8463263511657715, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.9311, + "step": 66 + }, + { + "epoch": 0.024735417179424072, + "grad_norm": 0.8261412978172302, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.9121, + "step": 67 + }, + { + "epoch": 0.02510460251046025, + "grad_norm": 0.8466128706932068, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9587, + "step": 68 + }, + { + "epoch": 0.025473787841496433, + "grad_norm": 0.8264670372009277, + "learning_rate": 3.45e-06, + "loss": 0.9301, + "step": 69 + }, + { + "epoch": 0.02584297317253261, + "grad_norm": 0.8121640086174011, + "learning_rate": 3.5e-06, + "loss": 0.9116, + "step": 70 + }, + { + "epoch": 0.02621215850356879, + "grad_norm": 0.8689830303192139, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.9224, + "step": 71 + }, + { + "epoch": 0.026581343834604972, + "grad_norm": 0.8256193399429321, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8696, + "step": 72 + }, + { + "epoch": 0.02695052916564115, + "grad_norm": 0.8338184952735901, + "learning_rate": 3.65e-06, + "loss": 0.934, + "step": 73 + }, + { + "epoch": 0.027319714496677333, + "grad_norm": 0.8366256356239319, + "learning_rate": 3.7e-06, + "loss": 0.9109, + "step": 74 + }, + { + "epoch": 0.027688899827713512, + "grad_norm": 0.7981867790222168, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8785, + "step": 75 + }, + { + "epoch": 0.028058085158749694, + "grad_norm": 0.8150340914726257, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.9343, + "step": 76 + }, + { + "epoch": 0.028427270489785873, + "grad_norm": 0.820603609085083, + "learning_rate": 3.85e-06, + "loss": 0.9195, + "step": 77 + }, + { + "epoch": 0.02879645582082205, + "grad_norm": 0.8418338894844055, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9318, + "step": 78 + }, + { + "epoch": 0.029165641151858233, + "grad_norm": 0.8316344022750854, + "learning_rate": 3.95e-06, + "loss": 0.9175, + "step": 79 + }, + { + "epoch": 0.029534826482894412, + "grad_norm": 0.8164108991622925, + "learning_rate": 4.000000000000001e-06, + "loss": 0.914, + "step": 80 + }, + { + "epoch": 0.029904011813930594, + "grad_norm": 0.7849715948104858, + "learning_rate": 4.05e-06, + "loss": 0.8593, + "step": 81 + }, + { + "epoch": 0.030273197144966773, + "grad_norm": 0.859346866607666, + "learning_rate": 4.1e-06, + "loss": 0.8864, + "step": 82 + }, + { + "epoch": 0.030642382476002955, + "grad_norm": 0.8121856451034546, + "learning_rate": 4.15e-06, + "loss": 0.8686, + "step": 83 + }, + { + "epoch": 0.031011567807039134, + "grad_norm": 0.8445794582366943, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.9037, + "step": 84 + }, + { + "epoch": 0.03138075313807531, + "grad_norm": 0.8285534381866455, + "learning_rate": 4.25e-06, + "loss": 0.9309, + "step": 85 + }, + { + "epoch": 0.03174993846911149, + "grad_norm": 0.794826090335846, + "learning_rate": 4.3e-06, + "loss": 0.8836, + "step": 86 + }, + { + "epoch": 0.03211912380014768, + "grad_norm": 0.853547990322113, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8828, + "step": 87 + }, + { + "epoch": 0.032488309131183855, + "grad_norm": 0.8319276571273804, + "learning_rate": 4.4e-06, + "loss": 0.8678, + "step": 88 + }, + { + "epoch": 0.032857494462220034, + "grad_norm": 0.8368034958839417, + "learning_rate": 4.450000000000001e-06, + "loss": 0.9213, + "step": 89 + }, + { + "epoch": 0.03322667979325621, + "grad_norm": 0.7604875564575195, + "learning_rate": 4.5e-06, + "loss": 0.8462, + "step": 90 + }, + { + "epoch": 0.03359586512429239, + "grad_norm": 0.8023838400840759, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.8957, + "step": 91 + }, + { + "epoch": 0.03396505045532858, + "grad_norm": 0.8296010494232178, + "learning_rate": 4.600000000000001e-06, + "loss": 0.8578, + "step": 92 + }, + { + "epoch": 0.034334235786364756, + "grad_norm": 0.802651047706604, + "learning_rate": 4.65e-06, + "loss": 0.8946, + "step": 93 + }, + { + "epoch": 0.034703421117400934, + "grad_norm": 0.8469492793083191, + "learning_rate": 4.7e-06, + "loss": 0.8709, + "step": 94 + }, + { + "epoch": 0.03507260644843711, + "grad_norm": 0.8248879313468933, + "learning_rate": 4.75e-06, + "loss": 0.9127, + "step": 95 + }, + { + "epoch": 0.0354417917794733, + "grad_norm": 0.8265485167503357, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9046, + "step": 96 + }, + { + "epoch": 0.03581097711050948, + "grad_norm": 0.9016802310943604, + "learning_rate": 4.85e-06, + "loss": 0.8448, + "step": 97 + }, + { + "epoch": 0.036180162441545656, + "grad_norm": 0.8030735850334167, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.8666, + "step": 98 + }, + { + "epoch": 0.036549347772581835, + "grad_norm": 0.8260457515716553, + "learning_rate": 4.95e-06, + "loss": 0.87, + "step": 99 + }, + { + "epoch": 0.03691853310361801, + "grad_norm": 0.8534119725227356, + "learning_rate": 5e-06, + "loss": 0.8781, + "step": 100 + }, + { + "epoch": 0.0372877184346542, + "grad_norm": 0.820518970489502, + "learning_rate": 4.999999952687895e-06, + "loss": 0.8719, + "step": 101 + }, + { + "epoch": 0.03765690376569038, + "grad_norm": 0.8288585543632507, + "learning_rate": 4.99999981075158e-06, + "loss": 0.8646, + "step": 102 + }, + { + "epoch": 0.038026089096726556, + "grad_norm": 0.8551903963088989, + "learning_rate": 4.999999574191062e-06, + "loss": 0.8824, + "step": 103 + }, + { + "epoch": 0.038395274427762735, + "grad_norm": 0.8003144860267639, + "learning_rate": 4.999999243006348e-06, + "loss": 0.8534, + "step": 104 + }, + { + "epoch": 0.038764459758798914, + "grad_norm": 0.8019240498542786, + "learning_rate": 4.9999988171974525e-06, + "loss": 0.8696, + "step": 105 + }, + { + "epoch": 0.0391336450898351, + "grad_norm": 0.8382614254951477, + "learning_rate": 4.999998296764391e-06, + "loss": 0.8746, + "step": 106 + }, + { + "epoch": 0.03950283042087128, + "grad_norm": 0.8664233088493347, + "learning_rate": 4.999997681707182e-06, + "loss": 0.9053, + "step": 107 + }, + { + "epoch": 0.03987201575190746, + "grad_norm": 0.8756380677223206, + "learning_rate": 4.99999697202585e-06, + "loss": 0.8714, + "step": 108 + }, + { + "epoch": 0.040241201082943635, + "grad_norm": 1.0035847425460815, + "learning_rate": 4.9999961677204224e-06, + "loss": 0.8111, + "step": 109 + }, + { + "epoch": 0.04061038641397982, + "grad_norm": 0.8640075922012329, + "learning_rate": 4.999995268790928e-06, + "loss": 0.9003, + "step": 110 + }, + { + "epoch": 0.040979571745016, + "grad_norm": 0.8350078463554382, + "learning_rate": 4.999994275237402e-06, + "loss": 0.8816, + "step": 111 + }, + { + "epoch": 0.04134875707605218, + "grad_norm": 0.8058568835258484, + "learning_rate": 4.999993187059882e-06, + "loss": 0.8243, + "step": 112 + }, + { + "epoch": 0.04171794240708836, + "grad_norm": 0.8728750348091125, + "learning_rate": 4.999992004258409e-06, + "loss": 0.8457, + "step": 113 + }, + { + "epoch": 0.042087127738124536, + "grad_norm": 0.8918057084083557, + "learning_rate": 4.999990726833027e-06, + "loss": 0.8412, + "step": 114 + }, + { + "epoch": 0.04245631306916072, + "grad_norm": 0.8393918871879578, + "learning_rate": 4.9999893547837855e-06, + "loss": 0.8806, + "step": 115 + }, + { + "epoch": 0.0428254984001969, + "grad_norm": 0.8215784430503845, + "learning_rate": 4.999987888110736e-06, + "loss": 0.8957, + "step": 116 + }, + { + "epoch": 0.04319468373123308, + "grad_norm": 0.8378515243530273, + "learning_rate": 4.999986326813933e-06, + "loss": 0.816, + "step": 117 + }, + { + "epoch": 0.04356386906226926, + "grad_norm": 0.8467821478843689, + "learning_rate": 4.999984670893438e-06, + "loss": 0.8879, + "step": 118 + }, + { + "epoch": 0.043933054393305436, + "grad_norm": 0.8451763391494751, + "learning_rate": 4.999982920349311e-06, + "loss": 0.8752, + "step": 119 + }, + { + "epoch": 0.04430223972434162, + "grad_norm": 0.8014891743659973, + "learning_rate": 4.99998107518162e-06, + "loss": 0.8279, + "step": 120 + }, + { + "epoch": 0.0446714250553778, + "grad_norm": 0.8478080630302429, + "learning_rate": 4.999979135390434e-06, + "loss": 0.9071, + "step": 121 + }, + { + "epoch": 0.04504061038641398, + "grad_norm": 0.8662890791893005, + "learning_rate": 4.999977100975827e-06, + "loss": 0.8752, + "step": 122 + }, + { + "epoch": 0.04540979571745016, + "grad_norm": 0.8697710633277893, + "learning_rate": 4.999974971937875e-06, + "loss": 0.8386, + "step": 123 + }, + { + "epoch": 0.04577898104848634, + "grad_norm": 0.8541619181632996, + "learning_rate": 4.99997274827666e-06, + "loss": 0.9063, + "step": 124 + }, + { + "epoch": 0.04614816637952252, + "grad_norm": 0.8500178456306458, + "learning_rate": 4.999970429992266e-06, + "loss": 0.8862, + "step": 125 + }, + { + "epoch": 0.0465173517105587, + "grad_norm": 0.8678516149520874, + "learning_rate": 4.9999680170847794e-06, + "loss": 0.8178, + "step": 126 + }, + { + "epoch": 0.04688653704159488, + "grad_norm": 0.867600679397583, + "learning_rate": 4.999965509554293e-06, + "loss": 0.9158, + "step": 127 + }, + { + "epoch": 0.04725572237263106, + "grad_norm": 0.8475435376167297, + "learning_rate": 4.9999629074009005e-06, + "loss": 0.8593, + "step": 128 + }, + { + "epoch": 0.047624907703667244, + "grad_norm": 0.8186939358711243, + "learning_rate": 4.999960210624701e-06, + "loss": 0.8621, + "step": 129 + }, + { + "epoch": 0.04799409303470342, + "grad_norm": 0.9259890913963318, + "learning_rate": 4.999957419225797e-06, + "loss": 0.8642, + "step": 130 + }, + { + "epoch": 0.0483632783657396, + "grad_norm": 0.8416559100151062, + "learning_rate": 4.999954533204293e-06, + "loss": 0.8799, + "step": 131 + }, + { + "epoch": 0.04873246369677578, + "grad_norm": 0.8305281400680542, + "learning_rate": 4.9999515525603e-06, + "loss": 0.8314, + "step": 132 + }, + { + "epoch": 0.04910164902781196, + "grad_norm": 0.8432246446609497, + "learning_rate": 4.999948477293929e-06, + "loss": 0.8687, + "step": 133 + }, + { + "epoch": 0.049470834358848144, + "grad_norm": 0.8665896058082581, + "learning_rate": 4.999945307405297e-06, + "loss": 0.861, + "step": 134 + }, + { + "epoch": 0.04984001968988432, + "grad_norm": 0.8684259057044983, + "learning_rate": 4.9999420428945236e-06, + "loss": 0.9069, + "step": 135 + }, + { + "epoch": 0.0502092050209205, + "grad_norm": 0.8950573801994324, + "learning_rate": 4.999938683761733e-06, + "loss": 0.8553, + "step": 136 + }, + { + "epoch": 0.05057839035195668, + "grad_norm": 0.8906726837158203, + "learning_rate": 4.9999352300070535e-06, + "loss": 0.865, + "step": 137 + }, + { + "epoch": 0.050947575682992866, + "grad_norm": 0.9318857789039612, + "learning_rate": 4.999931681630614e-06, + "loss": 0.8697, + "step": 138 + }, + { + "epoch": 0.051316761014029044, + "grad_norm": 0.8434486389160156, + "learning_rate": 4.999928038632549e-06, + "loss": 0.8665, + "step": 139 + }, + { + "epoch": 0.05168594634506522, + "grad_norm": 0.8642570376396179, + "learning_rate": 4.999924301012997e-06, + "loss": 0.8999, + "step": 140 + }, + { + "epoch": 0.0520551316761014, + "grad_norm": 0.8331663012504578, + "learning_rate": 4.999920468772099e-06, + "loss": 0.8458, + "step": 141 + }, + { + "epoch": 0.05242431700713758, + "grad_norm": 0.949670135974884, + "learning_rate": 4.9999165419100005e-06, + "loss": 0.8552, + "step": 142 + }, + { + "epoch": 0.052793502338173766, + "grad_norm": 0.8622894287109375, + "learning_rate": 4.999912520426849e-06, + "loss": 0.867, + "step": 143 + }, + { + "epoch": 0.053162687669209945, + "grad_norm": 0.8427858948707581, + "learning_rate": 4.999908404322799e-06, + "loss": 0.8842, + "step": 144 + }, + { + "epoch": 0.05353187300024612, + "grad_norm": 0.8718158006668091, + "learning_rate": 4.999904193598003e-06, + "loss": 0.8591, + "step": 145 + }, + { + "epoch": 0.0539010583312823, + "grad_norm": 0.8681777715682983, + "learning_rate": 4.999899888252624e-06, + "loss": 0.8984, + "step": 146 + }, + { + "epoch": 0.05427024366231848, + "grad_norm": 0.8601359128952026, + "learning_rate": 4.999895488286822e-06, + "loss": 0.8386, + "step": 147 + }, + { + "epoch": 0.054639428993354666, + "grad_norm": 0.8220537304878235, + "learning_rate": 4.999890993700766e-06, + "loss": 0.8362, + "step": 148 + }, + { + "epoch": 0.055008614324390845, + "grad_norm": 0.8281165361404419, + "learning_rate": 4.999886404494624e-06, + "loss": 0.8587, + "step": 149 + }, + { + "epoch": 0.055377799655427024, + "grad_norm": 0.8356446623802185, + "learning_rate": 4.999881720668571e-06, + "loss": 0.8988, + "step": 150 + }, + { + "epoch": 0.0557469849864632, + "grad_norm": 0.8825479745864868, + "learning_rate": 4.999876942222783e-06, + "loss": 0.8419, + "step": 151 + }, + { + "epoch": 0.05611617031749939, + "grad_norm": 0.8695886135101318, + "learning_rate": 4.999872069157443e-06, + "loss": 0.852, + "step": 152 + }, + { + "epoch": 0.056485355648535567, + "grad_norm": 0.8561027646064758, + "learning_rate": 4.999867101472733e-06, + "loss": 0.806, + "step": 153 + }, + { + "epoch": 0.056854540979571745, + "grad_norm": 0.8631170988082886, + "learning_rate": 4.999862039168843e-06, + "loss": 0.8227, + "step": 154 + }, + { + "epoch": 0.057223726310607924, + "grad_norm": 0.8831436634063721, + "learning_rate": 4.999856882245963e-06, + "loss": 0.8584, + "step": 155 + }, + { + "epoch": 0.0575929116416441, + "grad_norm": 0.8398553133010864, + "learning_rate": 4.9998516307042895e-06, + "loss": 0.8675, + "step": 156 + }, + { + "epoch": 0.05796209697268029, + "grad_norm": 0.9215529561042786, + "learning_rate": 4.999846284544021e-06, + "loss": 0.844, + "step": 157 + }, + { + "epoch": 0.05833128230371647, + "grad_norm": 0.8526574373245239, + "learning_rate": 4.999840843765359e-06, + "loss": 0.825, + "step": 158 + }, + { + "epoch": 0.058700467634752646, + "grad_norm": 0.8621156811714172, + "learning_rate": 4.99983530836851e-06, + "loss": 0.8762, + "step": 159 + }, + { + "epoch": 0.059069652965788824, + "grad_norm": 0.86864173412323, + "learning_rate": 4.999829678353684e-06, + "loss": 0.8886, + "step": 160 + }, + { + "epoch": 0.059438838296825, + "grad_norm": 0.8976534008979797, + "learning_rate": 4.9998239537210935e-06, + "loss": 0.8452, + "step": 161 + }, + { + "epoch": 0.05980802362786119, + "grad_norm": 0.8699798583984375, + "learning_rate": 4.999818134470955e-06, + "loss": 0.8482, + "step": 162 + }, + { + "epoch": 0.06017720895889737, + "grad_norm": 0.8608055710792542, + "learning_rate": 4.99981222060349e-06, + "loss": 0.8432, + "step": 163 + }, + { + "epoch": 0.060546394289933546, + "grad_norm": 0.8606122732162476, + "learning_rate": 4.999806212118921e-06, + "loss": 0.8591, + "step": 164 + }, + { + "epoch": 0.060915579620969725, + "grad_norm": 0.8934593200683594, + "learning_rate": 4.9998001090174745e-06, + "loss": 0.8672, + "step": 165 + }, + { + "epoch": 0.06128476495200591, + "grad_norm": 0.8507917523384094, + "learning_rate": 4.999793911299384e-06, + "loss": 0.8604, + "step": 166 + }, + { + "epoch": 0.06165395028304209, + "grad_norm": 0.8685324192047119, + "learning_rate": 4.999787618964883e-06, + "loss": 0.8796, + "step": 167 + }, + { + "epoch": 0.06202313561407827, + "grad_norm": 0.9141797423362732, + "learning_rate": 4.9997812320142095e-06, + "loss": 0.8473, + "step": 168 + }, + { + "epoch": 0.062392320945114446, + "grad_norm": 1.0198613405227661, + "learning_rate": 4.9997747504476045e-06, + "loss": 0.8412, + "step": 169 + }, + { + "epoch": 0.06276150627615062, + "grad_norm": 0.8494629263877869, + "learning_rate": 4.999768174265315e-06, + "loss": 0.8515, + "step": 170 + }, + { + "epoch": 0.0631306916071868, + "grad_norm": 0.8553645610809326, + "learning_rate": 4.999761503467589e-06, + "loss": 0.8504, + "step": 171 + }, + { + "epoch": 0.06349987693822298, + "grad_norm": 0.846227765083313, + "learning_rate": 4.999754738054678e-06, + "loss": 0.837, + "step": 172 + }, + { + "epoch": 0.06386906226925917, + "grad_norm": 0.8288367986679077, + "learning_rate": 4.999747878026841e-06, + "loss": 0.8366, + "step": 173 + }, + { + "epoch": 0.06423824760029535, + "grad_norm": 0.8543452620506287, + "learning_rate": 4.9997409233843345e-06, + "loss": 0.8545, + "step": 174 + }, + { + "epoch": 0.06460743293133153, + "grad_norm": 0.8958789110183716, + "learning_rate": 4.999733874127423e-06, + "loss": 0.8324, + "step": 175 + }, + { + "epoch": 0.06497661826236771, + "grad_norm": 0.8512812852859497, + "learning_rate": 4.999726730256373e-06, + "loss": 0.8172, + "step": 176 + }, + { + "epoch": 0.06534580359340389, + "grad_norm": 0.8509172797203064, + "learning_rate": 4.999719491771457e-06, + "loss": 0.8648, + "step": 177 + }, + { + "epoch": 0.06571498892444007, + "grad_norm": 0.8630576133728027, + "learning_rate": 4.999712158672945e-06, + "loss": 0.8518, + "step": 178 + }, + { + "epoch": 0.06608417425547625, + "grad_norm": 0.8487321138381958, + "learning_rate": 4.999704730961118e-06, + "loss": 0.8454, + "step": 179 + }, + { + "epoch": 0.06645335958651243, + "grad_norm": 0.8370772004127502, + "learning_rate": 4.999697208636255e-06, + "loss": 0.7777, + "step": 180 + }, + { + "epoch": 0.0668225449175486, + "grad_norm": 0.867073655128479, + "learning_rate": 4.999689591698642e-06, + "loss": 0.8204, + "step": 181 + }, + { + "epoch": 0.06719173024858478, + "grad_norm": 0.8590624332427979, + "learning_rate": 4.999681880148567e-06, + "loss": 0.885, + "step": 182 + }, + { + "epoch": 0.06756091557962098, + "grad_norm": 0.8786302804946899, + "learning_rate": 4.999674073986322e-06, + "loss": 0.8107, + "step": 183 + }, + { + "epoch": 0.06793010091065715, + "grad_norm": 0.8392751216888428, + "learning_rate": 4.999666173212201e-06, + "loss": 0.8198, + "step": 184 + }, + { + "epoch": 0.06829928624169333, + "grad_norm": 0.8693823218345642, + "learning_rate": 4.999658177826505e-06, + "loss": 0.8278, + "step": 185 + }, + { + "epoch": 0.06866847157272951, + "grad_norm": 0.9009088277816772, + "learning_rate": 4.999650087829536e-06, + "loss": 0.8554, + "step": 186 + }, + { + "epoch": 0.06903765690376569, + "grad_norm": 0.8450184464454651, + "learning_rate": 4.9996419032216e-06, + "loss": 0.7952, + "step": 187 + }, + { + "epoch": 0.06940684223480187, + "grad_norm": 0.851325511932373, + "learning_rate": 4.9996336240030065e-06, + "loss": 0.866, + "step": 188 + }, + { + "epoch": 0.06977602756583805, + "grad_norm": 0.846808135509491, + "learning_rate": 4.99962525017407e-06, + "loss": 0.8526, + "step": 189 + }, + { + "epoch": 0.07014521289687423, + "grad_norm": 0.8484999537467957, + "learning_rate": 4.999616781735106e-06, + "loss": 0.8134, + "step": 190 + }, + { + "epoch": 0.0705143982279104, + "grad_norm": 0.8739628195762634, + "learning_rate": 4.999608218686436e-06, + "loss": 0.7972, + "step": 191 + }, + { + "epoch": 0.0708835835589466, + "grad_norm": 0.8760291934013367, + "learning_rate": 4.999599561028384e-06, + "loss": 0.846, + "step": 192 + }, + { + "epoch": 0.07125276888998278, + "grad_norm": 0.8653873801231384, + "learning_rate": 4.999590808761277e-06, + "loss": 0.8571, + "step": 193 + }, + { + "epoch": 0.07162195422101895, + "grad_norm": 0.89369797706604, + "learning_rate": 4.999581961885447e-06, + "loss": 0.8503, + "step": 194 + }, + { + "epoch": 0.07199113955205513, + "grad_norm": 0.8840173482894897, + "learning_rate": 4.999573020401229e-06, + "loss": 0.8419, + "step": 195 + }, + { + "epoch": 0.07236032488309131, + "grad_norm": 0.8561863899230957, + "learning_rate": 4.9995639843089605e-06, + "loss": 0.7862, + "step": 196 + }, + { + "epoch": 0.07272951021412749, + "grad_norm": 0.8365263342857361, + "learning_rate": 4.9995548536089845e-06, + "loss": 0.8587, + "step": 197 + }, + { + "epoch": 0.07309869554516367, + "grad_norm": 0.9029537439346313, + "learning_rate": 4.9995456283016455e-06, + "loss": 0.8483, + "step": 198 + }, + { + "epoch": 0.07346788087619985, + "grad_norm": 0.8581278324127197, + "learning_rate": 4.999536308387294e-06, + "loss": 0.847, + "step": 199 + }, + { + "epoch": 0.07383706620723603, + "grad_norm": 0.8499622344970703, + "learning_rate": 4.999526893866282e-06, + "loss": 0.8161, + "step": 200 + }, + { + "epoch": 0.07420625153827222, + "grad_norm": 0.8650686740875244, + "learning_rate": 4.999517384738966e-06, + "loss": 0.8218, + "step": 201 + }, + { + "epoch": 0.0745754368693084, + "grad_norm": 0.9009180665016174, + "learning_rate": 4.999507781005705e-06, + "loss": 0.8505, + "step": 202 + }, + { + "epoch": 0.07494462220034458, + "grad_norm": 0.897419273853302, + "learning_rate": 4.9994980826668646e-06, + "loss": 0.7851, + "step": 203 + }, + { + "epoch": 0.07531380753138076, + "grad_norm": 0.8872283697128296, + "learning_rate": 4.99948828972281e-06, + "loss": 0.789, + "step": 204 + }, + { + "epoch": 0.07568299286241693, + "grad_norm": 0.8454247713088989, + "learning_rate": 4.9994784021739115e-06, + "loss": 0.8373, + "step": 205 + }, + { + "epoch": 0.07605217819345311, + "grad_norm": 0.869134247303009, + "learning_rate": 4.999468420020546e-06, + "loss": 0.837, + "step": 206 + }, + { + "epoch": 0.07642136352448929, + "grad_norm": 0.8689702153205872, + "learning_rate": 4.999458343263089e-06, + "loss": 0.8106, + "step": 207 + }, + { + "epoch": 0.07679054885552547, + "grad_norm": 0.8920548558235168, + "learning_rate": 4.999448171901923e-06, + "loss": 0.8608, + "step": 208 + }, + { + "epoch": 0.07715973418656165, + "grad_norm": 0.8637383580207825, + "learning_rate": 4.999437905937431e-06, + "loss": 0.8396, + "step": 209 + }, + { + "epoch": 0.07752891951759783, + "grad_norm": 0.9378350973129272, + "learning_rate": 4.9994275453700045e-06, + "loss": 0.8381, + "step": 210 + }, + { + "epoch": 0.07789810484863402, + "grad_norm": 0.8814995288848877, + "learning_rate": 4.9994170902000335e-06, + "loss": 0.8495, + "step": 211 + }, + { + "epoch": 0.0782672901796702, + "grad_norm": 0.8853291869163513, + "learning_rate": 4.9994065404279155e-06, + "loss": 0.8429, + "step": 212 + }, + { + "epoch": 0.07863647551070638, + "grad_norm": 0.8535809516906738, + "learning_rate": 4.999395896054048e-06, + "loss": 0.8198, + "step": 213 + }, + { + "epoch": 0.07900566084174256, + "grad_norm": 0.889284074306488, + "learning_rate": 4.999385157078835e-06, + "loss": 0.8836, + "step": 214 + }, + { + "epoch": 0.07937484617277873, + "grad_norm": 0.8783283829689026, + "learning_rate": 4.999374323502683e-06, + "loss": 0.8611, + "step": 215 + }, + { + "epoch": 0.07974403150381491, + "grad_norm": 0.851722240447998, + "learning_rate": 4.999363395326e-06, + "loss": 0.828, + "step": 216 + }, + { + "epoch": 0.08011321683485109, + "grad_norm": 0.8967769145965576, + "learning_rate": 4.999352372549203e-06, + "loss": 0.7991, + "step": 217 + }, + { + "epoch": 0.08048240216588727, + "grad_norm": 0.9296314716339111, + "learning_rate": 4.999341255172707e-06, + "loss": 0.893, + "step": 218 + }, + { + "epoch": 0.08085158749692345, + "grad_norm": 0.9072420001029968, + "learning_rate": 4.999330043196933e-06, + "loss": 0.8014, + "step": 219 + }, + { + "epoch": 0.08122077282795964, + "grad_norm": 0.9063705205917358, + "learning_rate": 4.999318736622306e-06, + "loss": 0.8206, + "step": 220 + }, + { + "epoch": 0.08158995815899582, + "grad_norm": 0.8354766964912415, + "learning_rate": 4.9993073354492525e-06, + "loss": 0.8148, + "step": 221 + }, + { + "epoch": 0.081959143490032, + "grad_norm": 0.878901481628418, + "learning_rate": 4.999295839678206e-06, + "loss": 0.8758, + "step": 222 + }, + { + "epoch": 0.08232832882106818, + "grad_norm": 0.8754085302352905, + "learning_rate": 4.999284249309602e-06, + "loss": 0.8447, + "step": 223 + }, + { + "epoch": 0.08269751415210436, + "grad_norm": 0.8729782104492188, + "learning_rate": 4.9992725643438765e-06, + "loss": 0.7938, + "step": 224 + }, + { + "epoch": 0.08306669948314054, + "grad_norm": 0.8772115111351013, + "learning_rate": 4.999260784781473e-06, + "loss": 0.8683, + "step": 225 + }, + { + "epoch": 0.08343588481417671, + "grad_norm": 0.8647124767303467, + "learning_rate": 4.999248910622838e-06, + "loss": 0.8111, + "step": 226 + }, + { + "epoch": 0.08380507014521289, + "grad_norm": 0.85676509141922, + "learning_rate": 4.999236941868421e-06, + "loss": 0.8005, + "step": 227 + }, + { + "epoch": 0.08417425547624907, + "grad_norm": 0.8511557579040527, + "learning_rate": 4.999224878518674e-06, + "loss": 0.8074, + "step": 228 + }, + { + "epoch": 0.08454344080728526, + "grad_norm": 0.8975043892860413, + "learning_rate": 4.9992127205740545e-06, + "loss": 0.8327, + "step": 229 + }, + { + "epoch": 0.08491262613832144, + "grad_norm": 0.863868772983551, + "learning_rate": 4.999200468035021e-06, + "loss": 0.8226, + "step": 230 + }, + { + "epoch": 0.08528181146935762, + "grad_norm": 0.8852335214614868, + "learning_rate": 4.9991881209020406e-06, + "loss": 0.8353, + "step": 231 + }, + { + "epoch": 0.0856509968003938, + "grad_norm": 0.8811020851135254, + "learning_rate": 4.999175679175577e-06, + "loss": 0.8028, + "step": 232 + }, + { + "epoch": 0.08602018213142998, + "grad_norm": 0.8707005381584167, + "learning_rate": 4.999163142856104e-06, + "loss": 0.7983, + "step": 233 + }, + { + "epoch": 0.08638936746246616, + "grad_norm": 0.857227087020874, + "learning_rate": 4.999150511944094e-06, + "loss": 0.7777, + "step": 234 + }, + { + "epoch": 0.08675855279350234, + "grad_norm": 0.8872169256210327, + "learning_rate": 4.999137786440026e-06, + "loss": 0.8692, + "step": 235 + }, + { + "epoch": 0.08712773812453851, + "grad_norm": 0.8813910484313965, + "learning_rate": 4.999124966344381e-06, + "loss": 0.8101, + "step": 236 + }, + { + "epoch": 0.0874969234555747, + "grad_norm": 0.8994487524032593, + "learning_rate": 4.999112051657646e-06, + "loss": 0.8646, + "step": 237 + }, + { + "epoch": 0.08786610878661087, + "grad_norm": 0.9409844875335693, + "learning_rate": 4.999099042380307e-06, + "loss": 0.8218, + "step": 238 + }, + { + "epoch": 0.08823529411764706, + "grad_norm": 0.8659898042678833, + "learning_rate": 4.999085938512859e-06, + "loss": 0.8347, + "step": 239 + }, + { + "epoch": 0.08860447944868324, + "grad_norm": 0.8241569995880127, + "learning_rate": 4.9990727400557965e-06, + "loss": 0.8112, + "step": 240 + }, + { + "epoch": 0.08897366477971942, + "grad_norm": 0.8582605123519897, + "learning_rate": 4.99905944700962e-06, + "loss": 0.8115, + "step": 241 + }, + { + "epoch": 0.0893428501107556, + "grad_norm": 0.8941754698753357, + "learning_rate": 4.999046059374831e-06, + "loss": 0.8304, + "step": 242 + }, + { + "epoch": 0.08971203544179178, + "grad_norm": 0.8905880451202393, + "learning_rate": 4.999032577151939e-06, + "loss": 0.8168, + "step": 243 + }, + { + "epoch": 0.09008122077282796, + "grad_norm": 0.8816720247268677, + "learning_rate": 4.999019000341452e-06, + "loss": 0.8422, + "step": 244 + }, + { + "epoch": 0.09045040610386414, + "grad_norm": 1.047232747077942, + "learning_rate": 4.999005328943884e-06, + "loss": 0.8136, + "step": 245 + }, + { + "epoch": 0.09081959143490032, + "grad_norm": 0.9141537547111511, + "learning_rate": 4.998991562959753e-06, + "loss": 0.8415, + "step": 246 + }, + { + "epoch": 0.0911887767659365, + "grad_norm": 0.8919954895973206, + "learning_rate": 4.998977702389581e-06, + "loss": 0.8224, + "step": 247 + }, + { + "epoch": 0.09155796209697269, + "grad_norm": 0.8710380792617798, + "learning_rate": 4.998963747233891e-06, + "loss": 0.7944, + "step": 248 + }, + { + "epoch": 0.09192714742800887, + "grad_norm": 0.8452226519584656, + "learning_rate": 4.998949697493212e-06, + "loss": 0.8128, + "step": 249 + }, + { + "epoch": 0.09229633275904504, + "grad_norm": 0.8582141399383545, + "learning_rate": 4.998935553168075e-06, + "loss": 0.8467, + "step": 250 + }, + { + "epoch": 0.09266551809008122, + "grad_norm": 0.8774867057800293, + "learning_rate": 4.998921314259017e-06, + "loss": 0.8261, + "step": 251 + }, + { + "epoch": 0.0930347034211174, + "grad_norm": 0.879334568977356, + "learning_rate": 4.998906980766576e-06, + "loss": 0.8346, + "step": 252 + }, + { + "epoch": 0.09340388875215358, + "grad_norm": 0.886013925075531, + "learning_rate": 4.998892552691294e-06, + "loss": 0.8562, + "step": 253 + }, + { + "epoch": 0.09377307408318976, + "grad_norm": 0.8755276203155518, + "learning_rate": 4.998878030033717e-06, + "loss": 0.8005, + "step": 254 + }, + { + "epoch": 0.09414225941422594, + "grad_norm": 0.9009412527084351, + "learning_rate": 4.998863412794396e-06, + "loss": 0.8057, + "step": 255 + }, + { + "epoch": 0.09451144474526212, + "grad_norm": 0.9637260437011719, + "learning_rate": 4.998848700973883e-06, + "loss": 0.8285, + "step": 256 + }, + { + "epoch": 0.09488063007629831, + "grad_norm": 0.8925495743751526, + "learning_rate": 4.9988338945727355e-06, + "loss": 0.8466, + "step": 257 + }, + { + "epoch": 0.09524981540733449, + "grad_norm": 0.88019198179245, + "learning_rate": 4.998818993591513e-06, + "loss": 0.828, + "step": 258 + }, + { + "epoch": 0.09561900073837067, + "grad_norm": 0.8707719445228577, + "learning_rate": 4.998803998030781e-06, + "loss": 0.7912, + "step": 259 + }, + { + "epoch": 0.09598818606940684, + "grad_norm": 0.9469668865203857, + "learning_rate": 4.998788907891107e-06, + "loss": 0.8255, + "step": 260 + }, + { + "epoch": 0.09635737140044302, + "grad_norm": 0.8590503931045532, + "learning_rate": 4.998773723173061e-06, + "loss": 0.834, + "step": 261 + }, + { + "epoch": 0.0967265567314792, + "grad_norm": 0.8726522922515869, + "learning_rate": 4.998758443877217e-06, + "loss": 0.8434, + "step": 262 + }, + { + "epoch": 0.09709574206251538, + "grad_norm": 0.8915356397628784, + "learning_rate": 4.998743070004156e-06, + "loss": 0.8455, + "step": 263 + }, + { + "epoch": 0.09746492739355156, + "grad_norm": 0.8678056597709656, + "learning_rate": 4.998727601554458e-06, + "loss": 0.7851, + "step": 264 + }, + { + "epoch": 0.09783411272458774, + "grad_norm": 0.8804232478141785, + "learning_rate": 4.998712038528709e-06, + "loss": 0.8163, + "step": 265 + }, + { + "epoch": 0.09820329805562392, + "grad_norm": 0.8934099674224854, + "learning_rate": 4.998696380927497e-06, + "loss": 0.8479, + "step": 266 + }, + { + "epoch": 0.09857248338666011, + "grad_norm": 0.9582729339599609, + "learning_rate": 4.998680628751417e-06, + "loss": 0.8351, + "step": 267 + }, + { + "epoch": 0.09894166871769629, + "grad_norm": 0.8772808909416199, + "learning_rate": 4.998664782001063e-06, + "loss": 0.8135, + "step": 268 + }, + { + "epoch": 0.09931085404873247, + "grad_norm": 0.8562557101249695, + "learning_rate": 4.998648840677035e-06, + "loss": 0.7817, + "step": 269 + }, + { + "epoch": 0.09968003937976865, + "grad_norm": 0.9073139429092407, + "learning_rate": 4.9986328047799385e-06, + "loss": 0.7951, + "step": 270 + }, + { + "epoch": 0.10004922471080482, + "grad_norm": 0.9205917119979858, + "learning_rate": 4.9986166743103774e-06, + "loss": 0.8265, + "step": 271 + }, + { + "epoch": 0.100418410041841, + "grad_norm": 0.8653632998466492, + "learning_rate": 4.9986004492689644e-06, + "loss": 0.8301, + "step": 272 + }, + { + "epoch": 0.10078759537287718, + "grad_norm": 0.9200085997581482, + "learning_rate": 4.9985841296563135e-06, + "loss": 0.8534, + "step": 273 + }, + { + "epoch": 0.10115678070391336, + "grad_norm": 0.8734180331230164, + "learning_rate": 4.998567715473041e-06, + "loss": 0.8183, + "step": 274 + }, + { + "epoch": 0.10152596603494954, + "grad_norm": 0.866165816783905, + "learning_rate": 4.99855120671977e-06, + "loss": 0.827, + "step": 275 + }, + { + "epoch": 0.10189515136598573, + "grad_norm": 0.9260159134864807, + "learning_rate": 4.998534603397123e-06, + "loss": 0.7965, + "step": 276 + }, + { + "epoch": 0.10226433669702191, + "grad_norm": 0.8854061365127563, + "learning_rate": 4.998517905505731e-06, + "loss": 0.808, + "step": 277 + }, + { + "epoch": 0.10263352202805809, + "grad_norm": 0.8753185272216797, + "learning_rate": 4.998501113046224e-06, + "loss": 0.8205, + "step": 278 + }, + { + "epoch": 0.10300270735909427, + "grad_norm": 0.8909716606140137, + "learning_rate": 4.998484226019239e-06, + "loss": 0.8521, + "step": 279 + }, + { + "epoch": 0.10337189269013045, + "grad_norm": 0.8754370212554932, + "learning_rate": 4.9984672444254145e-06, + "loss": 0.7891, + "step": 280 + }, + { + "epoch": 0.10374107802116662, + "grad_norm": 0.8687075972557068, + "learning_rate": 4.998450168265393e-06, + "loss": 0.7974, + "step": 281 + }, + { + "epoch": 0.1041102633522028, + "grad_norm": 0.8216086030006409, + "learning_rate": 4.998432997539821e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.10447944868323898, + "grad_norm": 0.899731457233429, + "learning_rate": 4.998415732249349e-06, + "loss": 0.8075, + "step": 283 + }, + { + "epoch": 0.10484863401427516, + "grad_norm": 0.8399525880813599, + "learning_rate": 4.998398372394631e-06, + "loss": 0.8167, + "step": 284 + }, + { + "epoch": 0.10521781934531135, + "grad_norm": 0.8872588276863098, + "learning_rate": 4.998380917976321e-06, + "loss": 0.7981, + "step": 285 + }, + { + "epoch": 0.10558700467634753, + "grad_norm": 0.8770443797111511, + "learning_rate": 4.998363368995083e-06, + "loss": 0.8156, + "step": 286 + }, + { + "epoch": 0.10595619000738371, + "grad_norm": 0.9171691536903381, + "learning_rate": 4.99834572545158e-06, + "loss": 0.7958, + "step": 287 + }, + { + "epoch": 0.10632537533841989, + "grad_norm": 0.8948536515235901, + "learning_rate": 4.99832798734648e-06, + "loss": 0.8092, + "step": 288 + }, + { + "epoch": 0.10669456066945607, + "grad_norm": 0.8909181356430054, + "learning_rate": 4.998310154680453e-06, + "loss": 0.8001, + "step": 289 + }, + { + "epoch": 0.10706374600049225, + "grad_norm": 0.9211814403533936, + "learning_rate": 4.9982922274541765e-06, + "loss": 0.8416, + "step": 290 + }, + { + "epoch": 0.10743293133152843, + "grad_norm": 0.9189214706420898, + "learning_rate": 4.998274205668326e-06, + "loss": 0.7836, + "step": 291 + }, + { + "epoch": 0.1078021166625646, + "grad_norm": 0.9062879085540771, + "learning_rate": 4.998256089323587e-06, + "loss": 0.81, + "step": 292 + }, + { + "epoch": 0.10817130199360078, + "grad_norm": 0.8764585256576538, + "learning_rate": 4.998237878420643e-06, + "loss": 0.8161, + "step": 293 + }, + { + "epoch": 0.10854048732463696, + "grad_norm": 0.8838504552841187, + "learning_rate": 4.998219572960183e-06, + "loss": 0.8339, + "step": 294 + }, + { + "epoch": 0.10890967265567315, + "grad_norm": 0.847440779209137, + "learning_rate": 4.998201172942901e-06, + "loss": 0.8275, + "step": 295 + }, + { + "epoch": 0.10927885798670933, + "grad_norm": 0.9169361591339111, + "learning_rate": 4.998182678369494e-06, + "loss": 0.8022, + "step": 296 + }, + { + "epoch": 0.10964804331774551, + "grad_norm": 0.8472815752029419, + "learning_rate": 4.99816408924066e-06, + "loss": 0.8192, + "step": 297 + }, + { + "epoch": 0.11001722864878169, + "grad_norm": 0.9293531775474548, + "learning_rate": 4.9981454055571045e-06, + "loss": 0.8554, + "step": 298 + }, + { + "epoch": 0.11038641397981787, + "grad_norm": 0.8915771245956421, + "learning_rate": 4.998126627319533e-06, + "loss": 0.8023, + "step": 299 + }, + { + "epoch": 0.11075559931085405, + "grad_norm": 0.9370061755180359, + "learning_rate": 4.998107754528657e-06, + "loss": 0.8097, + "step": 300 + }, + { + "epoch": 0.11112478464189023, + "grad_norm": 0.937268853187561, + "learning_rate": 4.998088787185192e-06, + "loss": 0.8048, + "step": 301 + }, + { + "epoch": 0.1114939699729264, + "grad_norm": 0.8639885187149048, + "learning_rate": 4.998069725289854e-06, + "loss": 0.7987, + "step": 302 + }, + { + "epoch": 0.11186315530396258, + "grad_norm": 0.9094707369804382, + "learning_rate": 4.998050568843364e-06, + "loss": 0.8551, + "step": 303 + }, + { + "epoch": 0.11223234063499878, + "grad_norm": 0.8595545291900635, + "learning_rate": 4.9980313178464504e-06, + "loss": 0.7923, + "step": 304 + }, + { + "epoch": 0.11260152596603495, + "grad_norm": 0.9705724120140076, + "learning_rate": 4.9980119722998396e-06, + "loss": 0.8356, + "step": 305 + }, + { + "epoch": 0.11297071129707113, + "grad_norm": 0.8938621282577515, + "learning_rate": 4.9979925322042635e-06, + "loss": 0.7797, + "step": 306 + }, + { + "epoch": 0.11333989662810731, + "grad_norm": 0.8729509711265564, + "learning_rate": 4.9979729975604584e-06, + "loss": 0.799, + "step": 307 + }, + { + "epoch": 0.11370908195914349, + "grad_norm": 0.8773247599601746, + "learning_rate": 4.997953368369164e-06, + "loss": 0.8094, + "step": 308 + }, + { + "epoch": 0.11407826729017967, + "grad_norm": 0.8874313831329346, + "learning_rate": 4.997933644631122e-06, + "loss": 0.7785, + "step": 309 + }, + { + "epoch": 0.11444745262121585, + "grad_norm": 0.9289500713348389, + "learning_rate": 4.997913826347082e-06, + "loss": 0.8127, + "step": 310 + }, + { + "epoch": 0.11481663795225203, + "grad_norm": 0.901099681854248, + "learning_rate": 4.99789391351779e-06, + "loss": 0.8038, + "step": 311 + }, + { + "epoch": 0.1151858232832882, + "grad_norm": 0.8916109204292297, + "learning_rate": 4.997873906144002e-06, + "loss": 0.8115, + "step": 312 + }, + { + "epoch": 0.1155550086143244, + "grad_norm": 0.9170032143592834, + "learning_rate": 4.997853804226476e-06, + "loss": 0.8181, + "step": 313 + }, + { + "epoch": 0.11592419394536058, + "grad_norm": 0.9100698828697205, + "learning_rate": 4.997833607765971e-06, + "loss": 0.7888, + "step": 314 + }, + { + "epoch": 0.11629337927639675, + "grad_norm": 0.897360622882843, + "learning_rate": 4.997813316763252e-06, + "loss": 0.8152, + "step": 315 + }, + { + "epoch": 0.11666256460743293, + "grad_norm": 0.893099308013916, + "learning_rate": 4.997792931219089e-06, + "loss": 0.7683, + "step": 316 + }, + { + "epoch": 0.11703174993846911, + "grad_norm": 0.9319385290145874, + "learning_rate": 4.9977724511342504e-06, + "loss": 0.7882, + "step": 317 + }, + { + "epoch": 0.11740093526950529, + "grad_norm": 0.9166727662086487, + "learning_rate": 4.997751876509513e-06, + "loss": 0.7975, + "step": 318 + }, + { + "epoch": 0.11777012060054147, + "grad_norm": 0.9373429417610168, + "learning_rate": 4.997731207345655e-06, + "loss": 0.8274, + "step": 319 + }, + { + "epoch": 0.11813930593157765, + "grad_norm": 0.8399270176887512, + "learning_rate": 4.997710443643461e-06, + "loss": 0.7732, + "step": 320 + }, + { + "epoch": 0.11850849126261383, + "grad_norm": 0.8919075727462769, + "learning_rate": 4.997689585403713e-06, + "loss": 0.7933, + "step": 321 + }, + { + "epoch": 0.11887767659365, + "grad_norm": 0.9245322942733765, + "learning_rate": 4.997668632627203e-06, + "loss": 0.8306, + "step": 322 + }, + { + "epoch": 0.1192468619246862, + "grad_norm": 0.89954674243927, + "learning_rate": 4.997647585314723e-06, + "loss": 0.8254, + "step": 323 + }, + { + "epoch": 0.11961604725572238, + "grad_norm": 0.9208563566207886, + "learning_rate": 4.9976264434670714e-06, + "loss": 0.8275, + "step": 324 + }, + { + "epoch": 0.11998523258675856, + "grad_norm": 1.0518693923950195, + "learning_rate": 4.9976052070850465e-06, + "loss": 0.8041, + "step": 325 + }, + { + "epoch": 0.12035441791779473, + "grad_norm": 0.8718807697296143, + "learning_rate": 4.997583876169453e-06, + "loss": 0.8171, + "step": 326 + }, + { + "epoch": 0.12072360324883091, + "grad_norm": 0.8952045440673828, + "learning_rate": 4.997562450721098e-06, + "loss": 0.8005, + "step": 327 + }, + { + "epoch": 0.12109278857986709, + "grad_norm": 0.8763337135314941, + "learning_rate": 4.997540930740792e-06, + "loss": 0.7838, + "step": 328 + }, + { + "epoch": 0.12146197391090327, + "grad_norm": 0.8835309147834778, + "learning_rate": 4.9975193162293505e-06, + "loss": 0.793, + "step": 329 + }, + { + "epoch": 0.12183115924193945, + "grad_norm": 0.9191171526908875, + "learning_rate": 4.997497607187591e-06, + "loss": 0.8317, + "step": 330 + }, + { + "epoch": 0.12220034457297563, + "grad_norm": 0.8889843225479126, + "learning_rate": 4.9974758036163355e-06, + "loss": 0.7937, + "step": 331 + }, + { + "epoch": 0.12256952990401182, + "grad_norm": 0.9089657664299011, + "learning_rate": 4.997453905516408e-06, + "loss": 0.8223, + "step": 332 + }, + { + "epoch": 0.122938715235048, + "grad_norm": 0.92867112159729, + "learning_rate": 4.9974319128886396e-06, + "loss": 0.8092, + "step": 333 + }, + { + "epoch": 0.12330790056608418, + "grad_norm": 0.8917028903961182, + "learning_rate": 4.997409825733861e-06, + "loss": 0.7728, + "step": 334 + }, + { + "epoch": 0.12367708589712036, + "grad_norm": 0.8886356949806213, + "learning_rate": 4.997387644052909e-06, + "loss": 0.8593, + "step": 335 + }, + { + "epoch": 0.12404627122815653, + "grad_norm": 0.8747639060020447, + "learning_rate": 4.997365367846623e-06, + "loss": 0.7963, + "step": 336 + }, + { + "epoch": 0.12441545655919271, + "grad_norm": 0.8846672177314758, + "learning_rate": 4.997342997115846e-06, + "loss": 0.774, + "step": 337 + }, + { + "epoch": 0.12478464189022889, + "grad_norm": 0.8727664947509766, + "learning_rate": 4.997320531861424e-06, + "loss": 0.7894, + "step": 338 + }, + { + "epoch": 0.12515382722126508, + "grad_norm": 0.9146867394447327, + "learning_rate": 4.997297972084209e-06, + "loss": 0.788, + "step": 339 + }, + { + "epoch": 0.12552301255230125, + "grad_norm": 0.8793301582336426, + "learning_rate": 4.997275317785053e-06, + "loss": 0.7873, + "step": 340 + }, + { + "epoch": 0.12589219788333744, + "grad_norm": 0.899469792842865, + "learning_rate": 4.997252568964814e-06, + "loss": 0.8054, + "step": 341 + }, + { + "epoch": 0.1262613832143736, + "grad_norm": 0.8407626152038574, + "learning_rate": 4.997229725624354e-06, + "loss": 0.7782, + "step": 342 + }, + { + "epoch": 0.1266305685454098, + "grad_norm": 0.9121686220169067, + "learning_rate": 4.997206787764537e-06, + "loss": 0.8135, + "step": 343 + }, + { + "epoch": 0.12699975387644596, + "grad_norm": 0.8709003925323486, + "learning_rate": 4.9971837553862324e-06, + "loss": 0.805, + "step": 344 + }, + { + "epoch": 0.12736893920748216, + "grad_norm": 0.8822476267814636, + "learning_rate": 4.997160628490309e-06, + "loss": 0.7888, + "step": 345 + }, + { + "epoch": 0.12773812453851835, + "grad_norm": 0.8969622254371643, + "learning_rate": 4.997137407077645e-06, + "loss": 0.8076, + "step": 346 + }, + { + "epoch": 0.12810730986955451, + "grad_norm": 0.9052038192749023, + "learning_rate": 4.997114091149118e-06, + "loss": 0.8207, + "step": 347 + }, + { + "epoch": 0.1284764952005907, + "grad_norm": 0.860755205154419, + "learning_rate": 4.997090680705611e-06, + "loss": 0.795, + "step": 348 + }, + { + "epoch": 0.12884568053162687, + "grad_norm": 0.8602296710014343, + "learning_rate": 4.99706717574801e-06, + "loss": 0.8034, + "step": 349 + }, + { + "epoch": 0.12921486586266306, + "grad_norm": 0.8612799644470215, + "learning_rate": 4.997043576277203e-06, + "loss": 0.7905, + "step": 350 + }, + { + "epoch": 0.12958405119369923, + "grad_norm": 0.9908099174499512, + "learning_rate": 4.997019882294086e-06, + "loss": 0.8259, + "step": 351 + }, + { + "epoch": 0.12995323652473542, + "grad_norm": 0.8944092988967896, + "learning_rate": 4.996996093799554e-06, + "loss": 0.7836, + "step": 352 + }, + { + "epoch": 0.1303224218557716, + "grad_norm": 0.9142276644706726, + "learning_rate": 4.996972210794509e-06, + "loss": 0.8118, + "step": 353 + }, + { + "epoch": 0.13069160718680778, + "grad_norm": 0.9402908682823181, + "learning_rate": 4.996948233279852e-06, + "loss": 0.8101, + "step": 354 + }, + { + "epoch": 0.13106079251784397, + "grad_norm": 0.9145587682723999, + "learning_rate": 4.996924161256494e-06, + "loss": 0.8238, + "step": 355 + }, + { + "epoch": 0.13142997784888014, + "grad_norm": 0.9261123538017273, + "learning_rate": 4.996899994725344e-06, + "loss": 0.8304, + "step": 356 + }, + { + "epoch": 0.13179916317991633, + "grad_norm": 0.8930022120475769, + "learning_rate": 4.996875733687317e-06, + "loss": 0.7747, + "step": 357 + }, + { + "epoch": 0.1321683485109525, + "grad_norm": 0.8823668360710144, + "learning_rate": 4.9968513781433315e-06, + "loss": 0.8134, + "step": 358 + }, + { + "epoch": 0.1325375338419887, + "grad_norm": 0.8473939299583435, + "learning_rate": 4.996826928094309e-06, + "loss": 0.7833, + "step": 359 + }, + { + "epoch": 0.13290671917302485, + "grad_norm": 0.8742851614952087, + "learning_rate": 4.996802383541176e-06, + "loss": 0.7532, + "step": 360 + }, + { + "epoch": 0.13327590450406104, + "grad_norm": 0.8971150517463684, + "learning_rate": 4.996777744484861e-06, + "loss": 0.7914, + "step": 361 + }, + { + "epoch": 0.1336450898350972, + "grad_norm": 0.8893861174583435, + "learning_rate": 4.996753010926296e-06, + "loss": 0.8268, + "step": 362 + }, + { + "epoch": 0.1340142751661334, + "grad_norm": 0.8671346306800842, + "learning_rate": 4.996728182866418e-06, + "loss": 0.7804, + "step": 363 + }, + { + "epoch": 0.13438346049716957, + "grad_norm": 0.8901523351669312, + "learning_rate": 4.9967032603061655e-06, + "loss": 0.7814, + "step": 364 + }, + { + "epoch": 0.13475264582820576, + "grad_norm": 0.9003875255584717, + "learning_rate": 4.996678243246483e-06, + "loss": 0.8193, + "step": 365 + }, + { + "epoch": 0.13512183115924195, + "grad_norm": 0.8973804116249084, + "learning_rate": 4.996653131688316e-06, + "loss": 0.8228, + "step": 366 + }, + { + "epoch": 0.13549101649027812, + "grad_norm": 0.8611428737640381, + "learning_rate": 4.996627925632617e-06, + "loss": 0.7733, + "step": 367 + }, + { + "epoch": 0.1358602018213143, + "grad_norm": 0.8708634376525879, + "learning_rate": 4.996602625080339e-06, + "loss": 0.7709, + "step": 368 + }, + { + "epoch": 0.13622938715235047, + "grad_norm": 0.930029571056366, + "learning_rate": 4.996577230032439e-06, + "loss": 0.7984, + "step": 369 + }, + { + "epoch": 0.13659857248338667, + "grad_norm": 0.8698320984840393, + "learning_rate": 4.996551740489879e-06, + "loss": 0.7694, + "step": 370 + }, + { + "epoch": 0.13696775781442283, + "grad_norm": 0.8765986561775208, + "learning_rate": 4.996526156453624e-06, + "loss": 0.7992, + "step": 371 + }, + { + "epoch": 0.13733694314545902, + "grad_norm": 0.9100019335746765, + "learning_rate": 4.996500477924642e-06, + "loss": 0.8417, + "step": 372 + }, + { + "epoch": 0.1377061284764952, + "grad_norm": 0.8693497180938721, + "learning_rate": 4.996474704903904e-06, + "loss": 0.7603, + "step": 373 + }, + { + "epoch": 0.13807531380753138, + "grad_norm": 0.874281644821167, + "learning_rate": 4.9964488373923865e-06, + "loss": 0.792, + "step": 374 + }, + { + "epoch": 0.13844449913856757, + "grad_norm": 0.9282156825065613, + "learning_rate": 4.9964228753910685e-06, + "loss": 0.8703, + "step": 375 + }, + { + "epoch": 0.13881368446960374, + "grad_norm": 0.8873798251152039, + "learning_rate": 4.9963968189009324e-06, + "loss": 0.786, + "step": 376 + }, + { + "epoch": 0.13918286980063993, + "grad_norm": 0.868864119052887, + "learning_rate": 4.996370667922965e-06, + "loss": 0.7815, + "step": 377 + }, + { + "epoch": 0.1395520551316761, + "grad_norm": 0.9150336980819702, + "learning_rate": 4.996344422458155e-06, + "loss": 0.7949, + "step": 378 + }, + { + "epoch": 0.1399212404627123, + "grad_norm": 0.8794249892234802, + "learning_rate": 4.996318082507497e-06, + "loss": 0.7503, + "step": 379 + }, + { + "epoch": 0.14029042579374845, + "grad_norm": 0.9185070395469666, + "learning_rate": 4.996291648071988e-06, + "loss": 0.8043, + "step": 380 + }, + { + "epoch": 0.14065961112478464, + "grad_norm": 0.8789845108985901, + "learning_rate": 4.996265119152627e-06, + "loss": 0.7647, + "step": 381 + }, + { + "epoch": 0.1410287964558208, + "grad_norm": 0.8894780278205872, + "learning_rate": 4.99623849575042e-06, + "loss": 0.788, + "step": 382 + }, + { + "epoch": 0.141397981786857, + "grad_norm": 0.9412915110588074, + "learning_rate": 4.996211777866372e-06, + "loss": 0.7974, + "step": 383 + }, + { + "epoch": 0.1417671671178932, + "grad_norm": 0.8994157910346985, + "learning_rate": 4.996184965501497e-06, + "loss": 0.78, + "step": 384 + }, + { + "epoch": 0.14213635244892936, + "grad_norm": 0.9516313076019287, + "learning_rate": 4.9961580586568095e-06, + "loss": 0.8062, + "step": 385 + }, + { + "epoch": 0.14250553777996555, + "grad_norm": 0.8979402184486389, + "learning_rate": 4.996131057333327e-06, + "loss": 0.8339, + "step": 386 + }, + { + "epoch": 0.14287472311100172, + "grad_norm": 0.8623480200767517, + "learning_rate": 4.996103961532072e-06, + "loss": 0.8066, + "step": 387 + }, + { + "epoch": 0.1432439084420379, + "grad_norm": 0.8892715573310852, + "learning_rate": 4.996076771254068e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.14361309377307407, + "grad_norm": 0.9217121005058289, + "learning_rate": 4.9960494865003486e-06, + "loss": 0.8128, + "step": 389 + }, + { + "epoch": 0.14398227910411027, + "grad_norm": 0.9867552518844604, + "learning_rate": 4.996022107271942e-06, + "loss": 0.7973, + "step": 390 + }, + { + "epoch": 0.14435146443514643, + "grad_norm": 0.903056263923645, + "learning_rate": 4.995994633569888e-06, + "loss": 0.804, + "step": 391 + }, + { + "epoch": 0.14472064976618262, + "grad_norm": 0.8795948028564453, + "learning_rate": 4.995967065395223e-06, + "loss": 0.7495, + "step": 392 + }, + { + "epoch": 0.14508983509721882, + "grad_norm": 0.8725371956825256, + "learning_rate": 4.9959394027489934e-06, + "loss": 0.7933, + "step": 393 + }, + { + "epoch": 0.14545902042825498, + "grad_norm": 0.8918120265007019, + "learning_rate": 4.995911645632245e-06, + "loss": 0.7678, + "step": 394 + }, + { + "epoch": 0.14582820575929117, + "grad_norm": 0.934451162815094, + "learning_rate": 4.995883794046029e-06, + "loss": 0.8161, + "step": 395 + }, + { + "epoch": 0.14619739109032734, + "grad_norm": 0.8813429474830627, + "learning_rate": 4.995855847991398e-06, + "loss": 0.7993, + "step": 396 + }, + { + "epoch": 0.14656657642136353, + "grad_norm": 0.8930681943893433, + "learning_rate": 4.995827807469412e-06, + "loss": 0.7737, + "step": 397 + }, + { + "epoch": 0.1469357617523997, + "grad_norm": 0.8840509057044983, + "learning_rate": 4.995799672481131e-06, + "loss": 0.7996, + "step": 398 + }, + { + "epoch": 0.1473049470834359, + "grad_norm": 0.887534499168396, + "learning_rate": 4.9957714430276196e-06, + "loss": 0.8072, + "step": 399 + }, + { + "epoch": 0.14767413241447205, + "grad_norm": 0.9916796684265137, + "learning_rate": 4.995743119109947e-06, + "loss": 0.847, + "step": 400 + }, + { + "epoch": 0.14804331774550825, + "grad_norm": 0.9485662579536438, + "learning_rate": 4.995714700729184e-06, + "loss": 0.8371, + "step": 401 + }, + { + "epoch": 0.14841250307654444, + "grad_norm": 0.9004511833190918, + "learning_rate": 4.995686187886408e-06, + "loss": 0.7994, + "step": 402 + }, + { + "epoch": 0.1487816884075806, + "grad_norm": 0.9183670282363892, + "learning_rate": 4.995657580582699e-06, + "loss": 0.7913, + "step": 403 + }, + { + "epoch": 0.1491508737386168, + "grad_norm": 0.8997277617454529, + "learning_rate": 4.995628878819137e-06, + "loss": 0.7709, + "step": 404 + }, + { + "epoch": 0.14952005906965296, + "grad_norm": 0.9115433096885681, + "learning_rate": 4.9956000825968086e-06, + "loss": 0.8312, + "step": 405 + }, + { + "epoch": 0.14988924440068915, + "grad_norm": 0.9109799861907959, + "learning_rate": 4.995571191916805e-06, + "loss": 0.8222, + "step": 406 + }, + { + "epoch": 0.15025842973172532, + "grad_norm": 0.8981993794441223, + "learning_rate": 4.9955422067802205e-06, + "loss": 0.8333, + "step": 407 + }, + { + "epoch": 0.1506276150627615, + "grad_norm": 1.7644963264465332, + "learning_rate": 4.995513127188151e-06, + "loss": 0.7885, + "step": 408 + }, + { + "epoch": 0.15099680039379768, + "grad_norm": 0.9482977986335754, + "learning_rate": 4.995483953141696e-06, + "loss": 0.8211, + "step": 409 + }, + { + "epoch": 0.15136598572483387, + "grad_norm": 0.9331035017967224, + "learning_rate": 4.995454684641961e-06, + "loss": 0.7764, + "step": 410 + }, + { + "epoch": 0.15173517105587006, + "grad_norm": 0.890426516532898, + "learning_rate": 4.995425321690055e-06, + "loss": 0.7796, + "step": 411 + }, + { + "epoch": 0.15210435638690623, + "grad_norm": 0.9329193234443665, + "learning_rate": 4.995395864287088e-06, + "loss": 0.7815, + "step": 412 + }, + { + "epoch": 0.15247354171794242, + "grad_norm": 0.8611469864845276, + "learning_rate": 4.995366312434174e-06, + "loss": 0.7848, + "step": 413 + }, + { + "epoch": 0.15284272704897858, + "grad_norm": 0.8654336333274841, + "learning_rate": 4.995336666132434e-06, + "loss": 0.7821, + "step": 414 + }, + { + "epoch": 0.15321191238001478, + "grad_norm": 0.904015064239502, + "learning_rate": 4.9953069253829875e-06, + "loss": 0.8034, + "step": 415 + }, + { + "epoch": 0.15358109771105094, + "grad_norm": 0.9867867827415466, + "learning_rate": 4.995277090186962e-06, + "loss": 0.7989, + "step": 416 + }, + { + "epoch": 0.15395028304208713, + "grad_norm": 0.899686336517334, + "learning_rate": 4.995247160545487e-06, + "loss": 0.7969, + "step": 417 + }, + { + "epoch": 0.1543194683731233, + "grad_norm": 0.883914589881897, + "learning_rate": 4.995217136459693e-06, + "loss": 0.8133, + "step": 418 + }, + { + "epoch": 0.1546886537041595, + "grad_norm": 0.8943600654602051, + "learning_rate": 4.995187017930718e-06, + "loss": 0.7676, + "step": 419 + }, + { + "epoch": 0.15505783903519565, + "grad_norm": 0.8856242299079895, + "learning_rate": 4.995156804959702e-06, + "loss": 0.7129, + "step": 420 + }, + { + "epoch": 0.15542702436623185, + "grad_norm": 0.9339002966880798, + "learning_rate": 4.9951264975477895e-06, + "loss": 0.7945, + "step": 421 + }, + { + "epoch": 0.15579620969726804, + "grad_norm": 1.004530429840088, + "learning_rate": 4.995096095696126e-06, + "loss": 0.8219, + "step": 422 + }, + { + "epoch": 0.1561653950283042, + "grad_norm": 0.9293914437294006, + "learning_rate": 4.995065599405862e-06, + "loss": 0.8059, + "step": 423 + }, + { + "epoch": 0.1565345803593404, + "grad_norm": 0.897552490234375, + "learning_rate": 4.995035008678153e-06, + "loss": 0.7482, + "step": 424 + }, + { + "epoch": 0.15690376569037656, + "grad_norm": 0.9051419496536255, + "learning_rate": 4.995004323514157e-06, + "loss": 0.8244, + "step": 425 + }, + { + "epoch": 0.15727295102141275, + "grad_norm": 0.8879945278167725, + "learning_rate": 4.9949735439150335e-06, + "loss": 0.7653, + "step": 426 + }, + { + "epoch": 0.15764213635244892, + "grad_norm": 0.901598334312439, + "learning_rate": 4.99494266988195e-06, + "loss": 0.7852, + "step": 427 + }, + { + "epoch": 0.1580113216834851, + "grad_norm": 0.9374473690986633, + "learning_rate": 4.994911701416073e-06, + "loss": 0.7544, + "step": 428 + }, + { + "epoch": 0.15838050701452128, + "grad_norm": 0.8810298442840576, + "learning_rate": 4.994880638518575e-06, + "loss": 0.7623, + "step": 429 + }, + { + "epoch": 0.15874969234555747, + "grad_norm": 0.9226915836334229, + "learning_rate": 4.994849481190634e-06, + "loss": 0.7503, + "step": 430 + }, + { + "epoch": 0.15911887767659366, + "grad_norm": 0.923017144203186, + "learning_rate": 4.994818229433427e-06, + "loss": 0.8182, + "step": 431 + }, + { + "epoch": 0.15948806300762983, + "grad_norm": 0.9048720002174377, + "learning_rate": 4.994786883248137e-06, + "loss": 0.7976, + "step": 432 + }, + { + "epoch": 0.15985724833866602, + "grad_norm": 0.8946258425712585, + "learning_rate": 4.99475544263595e-06, + "loss": 0.7349, + "step": 433 + }, + { + "epoch": 0.16022643366970218, + "grad_norm": 0.903343915939331, + "learning_rate": 4.994723907598058e-06, + "loss": 0.7816, + "step": 434 + }, + { + "epoch": 0.16059561900073838, + "grad_norm": 0.907153844833374, + "learning_rate": 4.994692278135653e-06, + "loss": 0.7753, + "step": 435 + }, + { + "epoch": 0.16096480433177454, + "grad_norm": 0.950080931186676, + "learning_rate": 4.994660554249933e-06, + "loss": 0.7821, + "step": 436 + }, + { + "epoch": 0.16133398966281073, + "grad_norm": 0.8720895648002625, + "learning_rate": 4.994628735942098e-06, + "loss": 0.8067, + "step": 437 + }, + { + "epoch": 0.1617031749938469, + "grad_norm": 0.9267756938934326, + "learning_rate": 4.994596823213353e-06, + "loss": 0.8121, + "step": 438 + }, + { + "epoch": 0.1620723603248831, + "grad_norm": 0.9312788844108582, + "learning_rate": 4.9945648160649054e-06, + "loss": 0.8059, + "step": 439 + }, + { + "epoch": 0.16244154565591928, + "grad_norm": 0.8700461983680725, + "learning_rate": 4.994532714497966e-06, + "loss": 0.7626, + "step": 440 + }, + { + "epoch": 0.16281073098695545, + "grad_norm": 0.8887227773666382, + "learning_rate": 4.9945005185137515e-06, + "loss": 0.7747, + "step": 441 + }, + { + "epoch": 0.16317991631799164, + "grad_norm": 0.9699864983558655, + "learning_rate": 4.99446822811348e-06, + "loss": 0.7935, + "step": 442 + }, + { + "epoch": 0.1635491016490278, + "grad_norm": 0.9109600782394409, + "learning_rate": 4.994435843298372e-06, + "loss": 0.7375, + "step": 443 + }, + { + "epoch": 0.163918286980064, + "grad_norm": 0.8919878005981445, + "learning_rate": 4.994403364069656e-06, + "loss": 0.7691, + "step": 444 + }, + { + "epoch": 0.16428747231110016, + "grad_norm": 0.9388747215270996, + "learning_rate": 4.994370790428559e-06, + "loss": 0.7982, + "step": 445 + }, + { + "epoch": 0.16465665764213636, + "grad_norm": 0.9099019169807434, + "learning_rate": 4.994338122376315e-06, + "loss": 0.7594, + "step": 446 + }, + { + "epoch": 0.16502584297317252, + "grad_norm": 0.9286575317382812, + "learning_rate": 4.994305359914161e-06, + "loss": 0.8153, + "step": 447 + }, + { + "epoch": 0.1653950283042087, + "grad_norm": 0.896110475063324, + "learning_rate": 4.9942725030433356e-06, + "loss": 0.8013, + "step": 448 + }, + { + "epoch": 0.1657642136352449, + "grad_norm": 0.8885270953178406, + "learning_rate": 4.994239551765083e-06, + "loss": 0.7642, + "step": 449 + }, + { + "epoch": 0.16613339896628107, + "grad_norm": 0.8727377653121948, + "learning_rate": 4.994206506080651e-06, + "loss": 0.7358, + "step": 450 + }, + { + "epoch": 0.16650258429731726, + "grad_norm": 0.9173669219017029, + "learning_rate": 4.9941733659912905e-06, + "loss": 0.8196, + "step": 451 + }, + { + "epoch": 0.16687176962835343, + "grad_norm": 0.9255698323249817, + "learning_rate": 4.994140131498254e-06, + "loss": 0.8184, + "step": 452 + }, + { + "epoch": 0.16724095495938962, + "grad_norm": 0.9053508639335632, + "learning_rate": 4.994106802602802e-06, + "loss": 0.7938, + "step": 453 + }, + { + "epoch": 0.16761014029042579, + "grad_norm": 0.9308109283447266, + "learning_rate": 4.994073379306193e-06, + "loss": 0.8067, + "step": 454 + }, + { + "epoch": 0.16797932562146198, + "grad_norm": 0.8316235542297363, + "learning_rate": 4.994039861609696e-06, + "loss": 0.7799, + "step": 455 + }, + { + "epoch": 0.16834851095249814, + "grad_norm": 0.8959935903549194, + "learning_rate": 4.994006249514575e-06, + "loss": 0.797, + "step": 456 + }, + { + "epoch": 0.16871769628353434, + "grad_norm": 0.8984429240226746, + "learning_rate": 4.993972543022106e-06, + "loss": 0.7763, + "step": 457 + }, + { + "epoch": 0.16908688161457053, + "grad_norm": 0.8788191676139832, + "learning_rate": 4.9939387421335626e-06, + "loss": 0.811, + "step": 458 + }, + { + "epoch": 0.1694560669456067, + "grad_norm": 0.921341061592102, + "learning_rate": 4.993904846850226e-06, + "loss": 0.8022, + "step": 459 + }, + { + "epoch": 0.16982525227664289, + "grad_norm": 0.9294300675392151, + "learning_rate": 4.993870857173378e-06, + "loss": 0.7544, + "step": 460 + }, + { + "epoch": 0.17019443760767905, + "grad_norm": 0.9679121375083923, + "learning_rate": 4.9938367731043035e-06, + "loss": 0.793, + "step": 461 + }, + { + "epoch": 0.17056362293871524, + "grad_norm": 0.9177256226539612, + "learning_rate": 4.993802594644295e-06, + "loss": 0.7825, + "step": 462 + }, + { + "epoch": 0.1709328082697514, + "grad_norm": 0.8937193155288696, + "learning_rate": 4.993768321794645e-06, + "loss": 0.7613, + "step": 463 + }, + { + "epoch": 0.1713019936007876, + "grad_norm": 0.9800901412963867, + "learning_rate": 4.993733954556652e-06, + "loss": 0.8105, + "step": 464 + }, + { + "epoch": 0.17167117893182376, + "grad_norm": 0.8884421586990356, + "learning_rate": 4.9936994929316155e-06, + "loss": 0.759, + "step": 465 + }, + { + "epoch": 0.17204036426285996, + "grad_norm": 0.980379045009613, + "learning_rate": 4.99366493692084e-06, + "loss": 0.8023, + "step": 466 + }, + { + "epoch": 0.17240954959389612, + "grad_norm": 0.8764387369155884, + "learning_rate": 4.993630286525634e-06, + "loss": 0.7574, + "step": 467 + }, + { + "epoch": 0.17277873492493231, + "grad_norm": 0.913542628288269, + "learning_rate": 4.993595541747309e-06, + "loss": 0.8313, + "step": 468 + }, + { + "epoch": 0.1731479202559685, + "grad_norm": 0.9282394647598267, + "learning_rate": 4.993560702587179e-06, + "loss": 0.7718, + "step": 469 + }, + { + "epoch": 0.17351710558700467, + "grad_norm": 0.9678360223770142, + "learning_rate": 4.9935257690465634e-06, + "loss": 0.7783, + "step": 470 + }, + { + "epoch": 0.17388629091804086, + "grad_norm": 0.949858546257019, + "learning_rate": 4.993490741126785e-06, + "loss": 0.814, + "step": 471 + }, + { + "epoch": 0.17425547624907703, + "grad_norm": 0.8861430883407593, + "learning_rate": 4.9934556188291685e-06, + "loss": 0.8069, + "step": 472 + }, + { + "epoch": 0.17462466158011322, + "grad_norm": 0.8986914157867432, + "learning_rate": 4.993420402155044e-06, + "loss": 0.7622, + "step": 473 + }, + { + "epoch": 0.1749938469111494, + "grad_norm": 0.8991053104400635, + "learning_rate": 4.993385091105743e-06, + "loss": 0.7721, + "step": 474 + }, + { + "epoch": 0.17536303224218558, + "grad_norm": 0.9329451322555542, + "learning_rate": 4.993349685682605e-06, + "loss": 0.7656, + "step": 475 + }, + { + "epoch": 0.17573221757322174, + "grad_norm": 0.876977801322937, + "learning_rate": 4.993314185886967e-06, + "loss": 0.8482, + "step": 476 + }, + { + "epoch": 0.17610140290425794, + "grad_norm": 0.9570649266242981, + "learning_rate": 4.9932785917201754e-06, + "loss": 0.8036, + "step": 477 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.8990687727928162, + "learning_rate": 4.993242903183575e-06, + "loss": 0.7766, + "step": 478 + }, + { + "epoch": 0.1768397735663303, + "grad_norm": 0.9275680780410767, + "learning_rate": 4.993207120278518e-06, + "loss": 0.8204, + "step": 479 + }, + { + "epoch": 0.1772089588973665, + "grad_norm": 0.9137722253799438, + "learning_rate": 4.9931712430063585e-06, + "loss": 0.7201, + "step": 480 + }, + { + "epoch": 0.17757814422840265, + "grad_norm": 0.938310980796814, + "learning_rate": 4.993135271368454e-06, + "loss": 0.7859, + "step": 481 + }, + { + "epoch": 0.17794732955943884, + "grad_norm": 0.941462516784668, + "learning_rate": 4.993099205366166e-06, + "loss": 0.8033, + "step": 482 + }, + { + "epoch": 0.178316514890475, + "grad_norm": 0.9050039649009705, + "learning_rate": 4.99306304500086e-06, + "loss": 0.8441, + "step": 483 + }, + { + "epoch": 0.1786857002215112, + "grad_norm": 0.9100262522697449, + "learning_rate": 4.993026790273905e-06, + "loss": 0.7973, + "step": 484 + }, + { + "epoch": 0.17905488555254737, + "grad_norm": 0.8839780688285828, + "learning_rate": 4.992990441186672e-06, + "loss": 0.8029, + "step": 485 + }, + { + "epoch": 0.17942407088358356, + "grad_norm": 0.8999461531639099, + "learning_rate": 4.992953997740538e-06, + "loss": 0.7783, + "step": 486 + }, + { + "epoch": 0.17979325621461975, + "grad_norm": 0.9286245107650757, + "learning_rate": 4.992917459936882e-06, + "loss": 0.7623, + "step": 487 + }, + { + "epoch": 0.18016244154565592, + "grad_norm": 0.9243547320365906, + "learning_rate": 4.992880827777088e-06, + "loss": 0.7888, + "step": 488 + }, + { + "epoch": 0.1805316268766921, + "grad_norm": 0.8967995643615723, + "learning_rate": 4.992844101262541e-06, + "loss": 0.7806, + "step": 489 + }, + { + "epoch": 0.18090081220772827, + "grad_norm": 0.9205966591835022, + "learning_rate": 4.99280728039463e-06, + "loss": 0.7851, + "step": 490 + }, + { + "epoch": 0.18126999753876447, + "grad_norm": 0.9394510388374329, + "learning_rate": 4.992770365174752e-06, + "loss": 0.8065, + "step": 491 + }, + { + "epoch": 0.18163918286980063, + "grad_norm": 0.911313533782959, + "learning_rate": 4.992733355604301e-06, + "loss": 0.8055, + "step": 492 + }, + { + "epoch": 0.18200836820083682, + "grad_norm": 0.9121047854423523, + "learning_rate": 4.99269625168468e-06, + "loss": 0.757, + "step": 493 + }, + { + "epoch": 0.182377553531873, + "grad_norm": 0.8973436951637268, + "learning_rate": 4.9926590534172926e-06, + "loss": 0.7526, + "step": 494 + }, + { + "epoch": 0.18274673886290918, + "grad_norm": 0.9058072566986084, + "learning_rate": 4.992621760803547e-06, + "loss": 0.8152, + "step": 495 + }, + { + "epoch": 0.18311592419394537, + "grad_norm": 0.8761561512947083, + "learning_rate": 4.992584373844853e-06, + "loss": 0.7594, + "step": 496 + }, + { + "epoch": 0.18348510952498154, + "grad_norm": 0.9290655851364136, + "learning_rate": 4.992546892542628e-06, + "loss": 0.801, + "step": 497 + }, + { + "epoch": 0.18385429485601773, + "grad_norm": 0.9174765944480896, + "learning_rate": 4.99250931689829e-06, + "loss": 0.7823, + "step": 498 + }, + { + "epoch": 0.1842234801870539, + "grad_norm": 0.9156612157821655, + "learning_rate": 4.992471646913261e-06, + "loss": 0.8129, + "step": 499 + }, + { + "epoch": 0.1845926655180901, + "grad_norm": 0.9134384989738464, + "learning_rate": 4.992433882588967e-06, + "loss": 0.7436, + "step": 500 + }, + { + "epoch": 0.18496185084912625, + "grad_norm": 0.9095898866653442, + "learning_rate": 4.9923960239268365e-06, + "loss": 0.7668, + "step": 501 + }, + { + "epoch": 0.18533103618016245, + "grad_norm": 0.9294151663780212, + "learning_rate": 4.992358070928304e-06, + "loss": 0.8083, + "step": 502 + }, + { + "epoch": 0.1857002215111986, + "grad_norm": 0.9146490693092346, + "learning_rate": 4.992320023594803e-06, + "loss": 0.7678, + "step": 503 + }, + { + "epoch": 0.1860694068422348, + "grad_norm": 0.9225884079933167, + "learning_rate": 4.992281881927778e-06, + "loss": 0.7853, + "step": 504 + }, + { + "epoch": 0.186438592173271, + "grad_norm": 0.9287518262863159, + "learning_rate": 4.992243645928669e-06, + "loss": 0.8154, + "step": 505 + }, + { + "epoch": 0.18680777750430716, + "grad_norm": 0.9329172372817993, + "learning_rate": 4.992205315598926e-06, + "loss": 0.7989, + "step": 506 + }, + { + "epoch": 0.18717696283534335, + "grad_norm": 0.9103102087974548, + "learning_rate": 4.9921668909399976e-06, + "loss": 0.7687, + "step": 507 + }, + { + "epoch": 0.18754614816637952, + "grad_norm": 0.909018337726593, + "learning_rate": 4.992128371953339e-06, + "loss": 0.7794, + "step": 508 + }, + { + "epoch": 0.1879153334974157, + "grad_norm": 0.8800110220909119, + "learning_rate": 4.992089758640407e-06, + "loss": 0.7365, + "step": 509 + }, + { + "epoch": 0.18828451882845187, + "grad_norm": 0.9656134247779846, + "learning_rate": 4.992051051002665e-06, + "loss": 0.8176, + "step": 510 + }, + { + "epoch": 0.18865370415948807, + "grad_norm": 0.9035509824752808, + "learning_rate": 4.992012249041578e-06, + "loss": 0.7532, + "step": 511 + }, + { + "epoch": 0.18902288949052423, + "grad_norm": 0.9327494502067566, + "learning_rate": 4.9919733527586126e-06, + "loss": 0.7733, + "step": 512 + }, + { + "epoch": 0.18939207482156042, + "grad_norm": 0.918406069278717, + "learning_rate": 4.991934362155243e-06, + "loss": 0.8047, + "step": 513 + }, + { + "epoch": 0.18976126015259662, + "grad_norm": 0.9656594395637512, + "learning_rate": 4.991895277232944e-06, + "loss": 0.7923, + "step": 514 + }, + { + "epoch": 0.19013044548363278, + "grad_norm": 0.9281619787216187, + "learning_rate": 4.991856097993195e-06, + "loss": 0.7801, + "step": 515 + }, + { + "epoch": 0.19049963081466897, + "grad_norm": 0.8924593329429626, + "learning_rate": 4.99181682443748e-06, + "loss": 0.7822, + "step": 516 + }, + { + "epoch": 0.19086881614570514, + "grad_norm": 0.9071043729782104, + "learning_rate": 4.991777456567284e-06, + "loss": 0.7975, + "step": 517 + }, + { + "epoch": 0.19123800147674133, + "grad_norm": 0.9340181350708008, + "learning_rate": 4.991737994384097e-06, + "loss": 0.8108, + "step": 518 + }, + { + "epoch": 0.1916071868077775, + "grad_norm": 0.8986587524414062, + "learning_rate": 4.991698437889414e-06, + "loss": 0.8141, + "step": 519 + }, + { + "epoch": 0.1919763721388137, + "grad_norm": 0.8858151435852051, + "learning_rate": 4.991658787084732e-06, + "loss": 0.7736, + "step": 520 + }, + { + "epoch": 0.19234555746984985, + "grad_norm": 0.9259651899337769, + "learning_rate": 4.991619041971551e-06, + "loss": 0.8044, + "step": 521 + }, + { + "epoch": 0.19271474280088605, + "grad_norm": 0.8833301067352295, + "learning_rate": 4.991579202551376e-06, + "loss": 0.7753, + "step": 522 + }, + { + "epoch": 0.1930839281319222, + "grad_norm": 0.8822750449180603, + "learning_rate": 4.991539268825713e-06, + "loss": 0.7369, + "step": 523 + }, + { + "epoch": 0.1934531134629584, + "grad_norm": 0.900492787361145, + "learning_rate": 4.9914992407960765e-06, + "loss": 0.792, + "step": 524 + }, + { + "epoch": 0.1938222987939946, + "grad_norm": 0.8839775323867798, + "learning_rate": 4.991459118463979e-06, + "loss": 0.7633, + "step": 525 + }, + { + "epoch": 0.19419148412503076, + "grad_norm": 0.9306631088256836, + "learning_rate": 4.991418901830941e-06, + "loss": 0.7585, + "step": 526 + }, + { + "epoch": 0.19456066945606695, + "grad_norm": 0.8982890248298645, + "learning_rate": 4.991378590898483e-06, + "loss": 0.7856, + "step": 527 + }, + { + "epoch": 0.19492985478710312, + "grad_norm": 0.8740949034690857, + "learning_rate": 4.991338185668133e-06, + "loss": 0.8008, + "step": 528 + }, + { + "epoch": 0.1952990401181393, + "grad_norm": 0.9367266297340393, + "learning_rate": 4.991297686141418e-06, + "loss": 0.8715, + "step": 529 + }, + { + "epoch": 0.19566822544917548, + "grad_norm": 0.8908909559249878, + "learning_rate": 4.9912570923198724e-06, + "loss": 0.777, + "step": 530 + }, + { + "epoch": 0.19603741078021167, + "grad_norm": 0.9466776251792908, + "learning_rate": 4.9912164042050315e-06, + "loss": 0.7857, + "step": 531 + }, + { + "epoch": 0.19640659611124783, + "grad_norm": 0.9122392535209656, + "learning_rate": 4.991175621798436e-06, + "loss": 0.7454, + "step": 532 + }, + { + "epoch": 0.19677578144228403, + "grad_norm": 0.9155495762825012, + "learning_rate": 4.99113474510163e-06, + "loss": 0.788, + "step": 533 + }, + { + "epoch": 0.19714496677332022, + "grad_norm": 0.9131052494049072, + "learning_rate": 4.99109377411616e-06, + "loss": 0.7724, + "step": 534 + }, + { + "epoch": 0.19751415210435638, + "grad_norm": 0.9260715246200562, + "learning_rate": 4.9910527088435766e-06, + "loss": 0.7539, + "step": 535 + }, + { + "epoch": 0.19788333743539258, + "grad_norm": 0.9215927124023438, + "learning_rate": 4.991011549285434e-06, + "loss": 0.8061, + "step": 536 + }, + { + "epoch": 0.19825252276642874, + "grad_norm": 0.9671293497085571, + "learning_rate": 4.990970295443291e-06, + "loss": 0.8186, + "step": 537 + }, + { + "epoch": 0.19862170809746493, + "grad_norm": 0.8750205636024475, + "learning_rate": 4.990928947318708e-06, + "loss": 0.7434, + "step": 538 + }, + { + "epoch": 0.1989908934285011, + "grad_norm": 0.9240961670875549, + "learning_rate": 4.990887504913251e-06, + "loss": 0.7365, + "step": 539 + }, + { + "epoch": 0.1993600787595373, + "grad_norm": 0.9260193109512329, + "learning_rate": 4.990845968228488e-06, + "loss": 0.8067, + "step": 540 + }, + { + "epoch": 0.19972926409057346, + "grad_norm": 0.9060749411582947, + "learning_rate": 4.990804337265991e-06, + "loss": 0.8146, + "step": 541 + }, + { + "epoch": 0.20009844942160965, + "grad_norm": 0.9333279728889465, + "learning_rate": 4.9907626120273355e-06, + "loss": 0.7811, + "step": 542 + }, + { + "epoch": 0.20046763475264584, + "grad_norm": 1.0027539730072021, + "learning_rate": 4.990720792514102e-06, + "loss": 0.7587, + "step": 543 + }, + { + "epoch": 0.200836820083682, + "grad_norm": 0.9125142097473145, + "learning_rate": 4.9906788787278725e-06, + "loss": 0.8037, + "step": 544 + }, + { + "epoch": 0.2012060054147182, + "grad_norm": 0.8967962861061096, + "learning_rate": 4.990636870670234e-06, + "loss": 0.7533, + "step": 545 + }, + { + "epoch": 0.20157519074575436, + "grad_norm": 0.9158695936203003, + "learning_rate": 4.9905947683427745e-06, + "loss": 0.7654, + "step": 546 + }, + { + "epoch": 0.20194437607679055, + "grad_norm": 0.9484332203865051, + "learning_rate": 4.99055257174709e-06, + "loss": 0.7419, + "step": 547 + }, + { + "epoch": 0.20231356140782672, + "grad_norm": 0.9259840846061707, + "learning_rate": 4.990510280884777e-06, + "loss": 0.7781, + "step": 548 + }, + { + "epoch": 0.2026827467388629, + "grad_norm": 0.8913872241973877, + "learning_rate": 4.990467895757435e-06, + "loss": 0.7545, + "step": 549 + }, + { + "epoch": 0.20305193206989908, + "grad_norm": 0.9442530870437622, + "learning_rate": 4.99042541636667e-06, + "loss": 0.7402, + "step": 550 + }, + { + "epoch": 0.20342111740093527, + "grad_norm": 0.9780921339988708, + "learning_rate": 4.9903828427140885e-06, + "loss": 0.7911, + "step": 551 + }, + { + "epoch": 0.20379030273197146, + "grad_norm": 0.9168081283569336, + "learning_rate": 4.990340174801302e-06, + "loss": 0.7886, + "step": 552 + }, + { + "epoch": 0.20415948806300763, + "grad_norm": 0.8821970224380493, + "learning_rate": 4.990297412629926e-06, + "loss": 0.8018, + "step": 553 + }, + { + "epoch": 0.20452867339404382, + "grad_norm": 0.8984123468399048, + "learning_rate": 4.99025455620158e-06, + "loss": 0.7459, + "step": 554 + }, + { + "epoch": 0.20489785872507998, + "grad_norm": 0.9142769575119019, + "learning_rate": 4.990211605517884e-06, + "loss": 0.7697, + "step": 555 + }, + { + "epoch": 0.20526704405611618, + "grad_norm": 0.8822063207626343, + "learning_rate": 4.990168560580465e-06, + "loss": 0.801, + "step": 556 + }, + { + "epoch": 0.20563622938715234, + "grad_norm": 0.9008304476737976, + "learning_rate": 4.990125421390952e-06, + "loss": 0.7478, + "step": 557 + }, + { + "epoch": 0.20600541471818853, + "grad_norm": 0.9141461849212646, + "learning_rate": 4.990082187950977e-06, + "loss": 0.7356, + "step": 558 + }, + { + "epoch": 0.2063746000492247, + "grad_norm": 0.9026126265525818, + "learning_rate": 4.9900388602621775e-06, + "loss": 0.7594, + "step": 559 + }, + { + "epoch": 0.2067437853802609, + "grad_norm": 0.9227587580680847, + "learning_rate": 4.989995438326193e-06, + "loss": 0.7465, + "step": 560 + }, + { + "epoch": 0.20711297071129708, + "grad_norm": 0.9264957904815674, + "learning_rate": 4.989951922144667e-06, + "loss": 0.799, + "step": 561 + }, + { + "epoch": 0.20748215604233325, + "grad_norm": 0.9035301208496094, + "learning_rate": 4.989908311719247e-06, + "loss": 0.8013, + "step": 562 + }, + { + "epoch": 0.20785134137336944, + "grad_norm": 0.9293225407600403, + "learning_rate": 4.989864607051583e-06, + "loss": 0.78, + "step": 563 + }, + { + "epoch": 0.2082205267044056, + "grad_norm": 0.9383095502853394, + "learning_rate": 4.989820808143328e-06, + "loss": 0.7629, + "step": 564 + }, + { + "epoch": 0.2085897120354418, + "grad_norm": 0.9269458055496216, + "learning_rate": 4.989776914996144e-06, + "loss": 0.7698, + "step": 565 + }, + { + "epoch": 0.20895889736647796, + "grad_norm": 0.8632141351699829, + "learning_rate": 4.989732927611688e-06, + "loss": 0.7084, + "step": 566 + }, + { + "epoch": 0.20932808269751416, + "grad_norm": 0.9031556844711304, + "learning_rate": 4.989688845991626e-06, + "loss": 0.7425, + "step": 567 + }, + { + "epoch": 0.20969726802855032, + "grad_norm": 0.9389427304267883, + "learning_rate": 4.989644670137627e-06, + "loss": 0.847, + "step": 568 + }, + { + "epoch": 0.2100664533595865, + "grad_norm": 0.8885316252708435, + "learning_rate": 4.9896004000513635e-06, + "loss": 0.768, + "step": 569 + }, + { + "epoch": 0.2104356386906227, + "grad_norm": 0.9279949069023132, + "learning_rate": 4.989556035734511e-06, + "loss": 0.8118, + "step": 570 + }, + { + "epoch": 0.21080482402165887, + "grad_norm": 0.9301499724388123, + "learning_rate": 4.989511577188748e-06, + "loss": 0.7743, + "step": 571 + }, + { + "epoch": 0.21117400935269506, + "grad_norm": 0.9175605773925781, + "learning_rate": 4.989467024415757e-06, + "loss": 0.7543, + "step": 572 + }, + { + "epoch": 0.21154319468373123, + "grad_norm": 0.8823210597038269, + "learning_rate": 4.989422377417225e-06, + "loss": 0.7554, + "step": 573 + }, + { + "epoch": 0.21191238001476742, + "grad_norm": 0.9052088260650635, + "learning_rate": 4.989377636194842e-06, + "loss": 0.7853, + "step": 574 + }, + { + "epoch": 0.21228156534580359, + "grad_norm": 0.9024859666824341, + "learning_rate": 4.9893328007503e-06, + "loss": 0.7822, + "step": 575 + }, + { + "epoch": 0.21265075067683978, + "grad_norm": 0.8866460919380188, + "learning_rate": 4.989287871085299e-06, + "loss": 0.781, + "step": 576 + }, + { + "epoch": 0.21301993600787594, + "grad_norm": 0.9139004349708557, + "learning_rate": 4.989242847201537e-06, + "loss": 0.7486, + "step": 577 + }, + { + "epoch": 0.21338912133891214, + "grad_norm": 0.912174642086029, + "learning_rate": 4.9891977291007174e-06, + "loss": 0.7911, + "step": 578 + }, + { + "epoch": 0.2137583066699483, + "grad_norm": 0.9372491240501404, + "learning_rate": 4.989152516784551e-06, + "loss": 0.7751, + "step": 579 + }, + { + "epoch": 0.2141274920009845, + "grad_norm": 0.9477733373641968, + "learning_rate": 4.989107210254748e-06, + "loss": 0.7798, + "step": 580 + }, + { + "epoch": 0.21449667733202069, + "grad_norm": 0.9927518367767334, + "learning_rate": 4.989061809513021e-06, + "loss": 0.7844, + "step": 581 + }, + { + "epoch": 0.21486586266305685, + "grad_norm": 0.9412055015563965, + "learning_rate": 4.98901631456109e-06, + "loss": 0.8244, + "step": 582 + }, + { + "epoch": 0.21523504799409304, + "grad_norm": 0.8675752282142639, + "learning_rate": 4.988970725400678e-06, + "loss": 0.7988, + "step": 583 + }, + { + "epoch": 0.2156042333251292, + "grad_norm": 0.9113022089004517, + "learning_rate": 4.98892504203351e-06, + "loss": 0.7368, + "step": 584 + }, + { + "epoch": 0.2159734186561654, + "grad_norm": 0.949691116809845, + "learning_rate": 4.988879264461314e-06, + "loss": 0.7842, + "step": 585 + }, + { + "epoch": 0.21634260398720156, + "grad_norm": 0.9027278423309326, + "learning_rate": 4.9888333926858235e-06, + "loss": 0.7686, + "step": 586 + }, + { + "epoch": 0.21671178931823776, + "grad_norm": 0.926006019115448, + "learning_rate": 4.988787426708775e-06, + "loss": 0.8051, + "step": 587 + }, + { + "epoch": 0.21708097464927392, + "grad_norm": 0.9157929420471191, + "learning_rate": 4.988741366531906e-06, + "loss": 0.7682, + "step": 588 + }, + { + "epoch": 0.21745015998031011, + "grad_norm": 0.9037911295890808, + "learning_rate": 4.988695212156963e-06, + "loss": 0.7853, + "step": 589 + }, + { + "epoch": 0.2178193453113463, + "grad_norm": 0.927545964717865, + "learning_rate": 4.988648963585692e-06, + "loss": 0.8019, + "step": 590 + }, + { + "epoch": 0.21818853064238247, + "grad_norm": 0.901202917098999, + "learning_rate": 4.988602620819843e-06, + "loss": 0.7602, + "step": 591 + }, + { + "epoch": 0.21855771597341866, + "grad_norm": 0.92628413438797, + "learning_rate": 4.98855618386117e-06, + "loss": 0.7674, + "step": 592 + }, + { + "epoch": 0.21892690130445483, + "grad_norm": 0.9130452871322632, + "learning_rate": 4.988509652711431e-06, + "loss": 0.7891, + "step": 593 + }, + { + "epoch": 0.21929608663549102, + "grad_norm": 0.8740320801734924, + "learning_rate": 4.988463027372387e-06, + "loss": 0.7462, + "step": 594 + }, + { + "epoch": 0.2196652719665272, + "grad_norm": 0.9068711400032043, + "learning_rate": 4.9884163078458026e-06, + "loss": 0.7934, + "step": 595 + }, + { + "epoch": 0.22003445729756338, + "grad_norm": 0.8960202932357788, + "learning_rate": 4.988369494133447e-06, + "loss": 0.7855, + "step": 596 + }, + { + "epoch": 0.22040364262859954, + "grad_norm": 0.8958917260169983, + "learning_rate": 4.98832258623709e-06, + "loss": 0.7641, + "step": 597 + }, + { + "epoch": 0.22077282795963574, + "grad_norm": 0.9499006271362305, + "learning_rate": 4.988275584158509e-06, + "loss": 0.773, + "step": 598 + }, + { + "epoch": 0.22114201329067193, + "grad_norm": 0.9037056565284729, + "learning_rate": 4.988228487899483e-06, + "loss": 0.7528, + "step": 599 + }, + { + "epoch": 0.2215111986217081, + "grad_norm": 0.9397220611572266, + "learning_rate": 4.988181297461794e-06, + "loss": 0.7659, + "step": 600 + }, + { + "epoch": 0.2218803839527443, + "grad_norm": 0.91253262758255, + "learning_rate": 4.988134012847228e-06, + "loss": 0.7959, + "step": 601 + }, + { + "epoch": 0.22224956928378045, + "grad_norm": 0.9446122646331787, + "learning_rate": 4.988086634057575e-06, + "loss": 0.7876, + "step": 602 + }, + { + "epoch": 0.22261875461481664, + "grad_norm": 0.9358010292053223, + "learning_rate": 4.9880391610946276e-06, + "loss": 0.7768, + "step": 603 + }, + { + "epoch": 0.2229879399458528, + "grad_norm": 0.8857962489128113, + "learning_rate": 4.987991593960184e-06, + "loss": 0.7973, + "step": 604 + }, + { + "epoch": 0.223357125276889, + "grad_norm": 0.8937807679176331, + "learning_rate": 4.987943932656043e-06, + "loss": 0.7557, + "step": 605 + }, + { + "epoch": 0.22372631060792517, + "grad_norm": 0.9220109581947327, + "learning_rate": 4.9878961771840096e-06, + "loss": 0.8054, + "step": 606 + }, + { + "epoch": 0.22409549593896136, + "grad_norm": 0.8997550010681152, + "learning_rate": 4.987848327545891e-06, + "loss": 0.7715, + "step": 607 + }, + { + "epoch": 0.22446468126999755, + "grad_norm": 0.9468237161636353, + "learning_rate": 4.9878003837434986e-06, + "loss": 0.7738, + "step": 608 + }, + { + "epoch": 0.22483386660103372, + "grad_norm": 0.9114269018173218, + "learning_rate": 4.987752345778647e-06, + "loss": 0.7656, + "step": 609 + }, + { + "epoch": 0.2252030519320699, + "grad_norm": 0.9273460507392883, + "learning_rate": 4.987704213653154e-06, + "loss": 0.7221, + "step": 610 + }, + { + "epoch": 0.22557223726310607, + "grad_norm": 1.0038539171218872, + "learning_rate": 4.987655987368842e-06, + "loss": 0.7798, + "step": 611 + }, + { + "epoch": 0.22594142259414227, + "grad_norm": 0.9012103080749512, + "learning_rate": 4.987607666927535e-06, + "loss": 0.7526, + "step": 612 + }, + { + "epoch": 0.22631060792517843, + "grad_norm": 0.9375380277633667, + "learning_rate": 4.987559252331064e-06, + "loss": 0.7864, + "step": 613 + }, + { + "epoch": 0.22667979325621462, + "grad_norm": 0.9294722080230713, + "learning_rate": 4.98751074358126e-06, + "loss": 0.7441, + "step": 614 + }, + { + "epoch": 0.2270489785872508, + "grad_norm": 0.9203857183456421, + "learning_rate": 4.9874621406799595e-06, + "loss": 0.7508, + "step": 615 + }, + { + "epoch": 0.22741816391828698, + "grad_norm": 0.9149799346923828, + "learning_rate": 4.987413443629002e-06, + "loss": 0.764, + "step": 616 + }, + { + "epoch": 0.22778734924932317, + "grad_norm": 0.9542253613471985, + "learning_rate": 4.987364652430231e-06, + "loss": 0.7515, + "step": 617 + }, + { + "epoch": 0.22815653458035934, + "grad_norm": 0.9015132784843445, + "learning_rate": 4.9873157670854925e-06, + "loss": 0.7474, + "step": 618 + }, + { + "epoch": 0.22852571991139553, + "grad_norm": 0.8520395159721375, + "learning_rate": 4.987266787596637e-06, + "loss": 0.7202, + "step": 619 + }, + { + "epoch": 0.2288949052424317, + "grad_norm": 0.939619243144989, + "learning_rate": 4.987217713965519e-06, + "loss": 0.7998, + "step": 620 + }, + { + "epoch": 0.2292640905734679, + "grad_norm": 0.9087517857551575, + "learning_rate": 4.9871685461939954e-06, + "loss": 0.7436, + "step": 621 + }, + { + "epoch": 0.22963327590450405, + "grad_norm": 0.9153372049331665, + "learning_rate": 4.9871192842839264e-06, + "loss": 0.7439, + "step": 622 + }, + { + "epoch": 0.23000246123554025, + "grad_norm": 0.9004572629928589, + "learning_rate": 4.98706992823718e-06, + "loss": 0.7366, + "step": 623 + }, + { + "epoch": 0.2303716465665764, + "grad_norm": 0.8933371305465698, + "learning_rate": 4.9870204780556185e-06, + "loss": 0.7707, + "step": 624 + }, + { + "epoch": 0.2307408318976126, + "grad_norm": 0.9173306822776794, + "learning_rate": 4.9869709337411184e-06, + "loss": 0.769, + "step": 625 + }, + { + "epoch": 0.2311100172286488, + "grad_norm": 0.9018936157226562, + "learning_rate": 4.986921295295554e-06, + "loss": 0.7574, + "step": 626 + }, + { + "epoch": 0.23147920255968496, + "grad_norm": 0.9065893888473511, + "learning_rate": 4.986871562720803e-06, + "loss": 0.7649, + "step": 627 + }, + { + "epoch": 0.23184838789072115, + "grad_norm": 0.875457763671875, + "learning_rate": 4.986821736018748e-06, + "loss": 0.7606, + "step": 628 + }, + { + "epoch": 0.23221757322175732, + "grad_norm": 0.9208522439002991, + "learning_rate": 4.986771815191275e-06, + "loss": 0.7883, + "step": 629 + }, + { + "epoch": 0.2325867585527935, + "grad_norm": 0.9178383350372314, + "learning_rate": 4.986721800240273e-06, + "loss": 0.7694, + "step": 630 + }, + { + "epoch": 0.23295594388382967, + "grad_norm": 0.9127592444419861, + "learning_rate": 4.986671691167637e-06, + "loss": 0.7405, + "step": 631 + }, + { + "epoch": 0.23332512921486587, + "grad_norm": 0.9244971871376038, + "learning_rate": 4.986621487975261e-06, + "loss": 0.796, + "step": 632 + }, + { + "epoch": 0.23369431454590203, + "grad_norm": 0.9289013147354126, + "learning_rate": 4.9865711906650485e-06, + "loss": 0.7398, + "step": 633 + }, + { + "epoch": 0.23406349987693822, + "grad_norm": 0.8860224485397339, + "learning_rate": 4.9865207992389e-06, + "loss": 0.7518, + "step": 634 + }, + { + "epoch": 0.2344326852079744, + "grad_norm": 0.983054518699646, + "learning_rate": 4.986470313698723e-06, + "loss": 0.772, + "step": 635 + }, + { + "epoch": 0.23480187053901058, + "grad_norm": 0.896415650844574, + "learning_rate": 4.98641973404643e-06, + "loss": 0.783, + "step": 636 + }, + { + "epoch": 0.23517105587004677, + "grad_norm": 0.9165555238723755, + "learning_rate": 4.986369060283935e-06, + "loss": 0.7902, + "step": 637 + }, + { + "epoch": 0.23554024120108294, + "grad_norm": 0.928102970123291, + "learning_rate": 4.986318292413157e-06, + "loss": 0.7458, + "step": 638 + }, + { + "epoch": 0.23590942653211913, + "grad_norm": 0.8924850821495056, + "learning_rate": 4.986267430436015e-06, + "loss": 0.7464, + "step": 639 + }, + { + "epoch": 0.2362786118631553, + "grad_norm": 0.9051222801208496, + "learning_rate": 4.986216474354436e-06, + "loss": 0.7789, + "step": 640 + }, + { + "epoch": 0.2366477971941915, + "grad_norm": 0.9151371717453003, + "learning_rate": 4.986165424170347e-06, + "loss": 0.7722, + "step": 641 + }, + { + "epoch": 0.23701698252522765, + "grad_norm": 0.9557187557220459, + "learning_rate": 4.9861142798856824e-06, + "loss": 0.7606, + "step": 642 + }, + { + "epoch": 0.23738616785626385, + "grad_norm": 0.9134449362754822, + "learning_rate": 4.986063041502377e-06, + "loss": 0.7482, + "step": 643 + }, + { + "epoch": 0.2377553531873, + "grad_norm": 0.920628547668457, + "learning_rate": 4.9860117090223695e-06, + "loss": 0.7545, + "step": 644 + }, + { + "epoch": 0.2381245385183362, + "grad_norm": 0.9102922081947327, + "learning_rate": 4.9859602824476035e-06, + "loss": 0.7575, + "step": 645 + }, + { + "epoch": 0.2384937238493724, + "grad_norm": 0.9474995136260986, + "learning_rate": 4.985908761780025e-06, + "loss": 0.7511, + "step": 646 + }, + { + "epoch": 0.23886290918040856, + "grad_norm": 0.9534257650375366, + "learning_rate": 4.9858571470215854e-06, + "loss": 0.7846, + "step": 647 + }, + { + "epoch": 0.23923209451144475, + "grad_norm": 0.9420318007469177, + "learning_rate": 4.9858054381742374e-06, + "loss": 0.7846, + "step": 648 + }, + { + "epoch": 0.23960127984248092, + "grad_norm": 0.9332766532897949, + "learning_rate": 4.9857536352399376e-06, + "loss": 0.7763, + "step": 649 + }, + { + "epoch": 0.2399704651735171, + "grad_norm": 0.8975653648376465, + "learning_rate": 4.985701738220647e-06, + "loss": 0.7375, + "step": 650 + }, + { + "epoch": 0.24033965050455328, + "grad_norm": 1.0437159538269043, + "learning_rate": 4.98564974711833e-06, + "loss": 0.7838, + "step": 651 + }, + { + "epoch": 0.24070883583558947, + "grad_norm": 0.9787809252738953, + "learning_rate": 4.985597661934955e-06, + "loss": 0.8084, + "step": 652 + }, + { + "epoch": 0.24107802116662563, + "grad_norm": 0.9051234722137451, + "learning_rate": 4.985545482672493e-06, + "loss": 0.7251, + "step": 653 + }, + { + "epoch": 0.24144720649766183, + "grad_norm": 0.9520812630653381, + "learning_rate": 4.985493209332918e-06, + "loss": 0.7966, + "step": 654 + }, + { + "epoch": 0.24181639182869802, + "grad_norm": 0.9688146114349365, + "learning_rate": 4.985440841918211e-06, + "loss": 0.7541, + "step": 655 + }, + { + "epoch": 0.24218557715973418, + "grad_norm": 0.9103109240531921, + "learning_rate": 4.9853883804303515e-06, + "loss": 0.7515, + "step": 656 + }, + { + "epoch": 0.24255476249077038, + "grad_norm": 0.8874120712280273, + "learning_rate": 4.9853358248713266e-06, + "loss": 0.7416, + "step": 657 + }, + { + "epoch": 0.24292394782180654, + "grad_norm": 0.8995088338851929, + "learning_rate": 4.9852831752431256e-06, + "loss": 0.7567, + "step": 658 + }, + { + "epoch": 0.24329313315284273, + "grad_norm": 0.9225655198097229, + "learning_rate": 4.98523043154774e-06, + "loss": 0.754, + "step": 659 + }, + { + "epoch": 0.2436623184838789, + "grad_norm": 0.9014883637428284, + "learning_rate": 4.985177593787167e-06, + "loss": 0.7289, + "step": 660 + }, + { + "epoch": 0.2440315038149151, + "grad_norm": 1.0416333675384521, + "learning_rate": 4.9851246619634054e-06, + "loss": 0.7705, + "step": 661 + }, + { + "epoch": 0.24440068914595126, + "grad_norm": 0.9075980186462402, + "learning_rate": 4.9850716360784615e-06, + "loss": 0.7608, + "step": 662 + }, + { + "epoch": 0.24476987447698745, + "grad_norm": 0.8677279949188232, + "learning_rate": 4.98501851613434e-06, + "loss": 0.7988, + "step": 663 + }, + { + "epoch": 0.24513905980802364, + "grad_norm": 0.9182518124580383, + "learning_rate": 4.984965302133051e-06, + "loss": 0.7687, + "step": 664 + }, + { + "epoch": 0.2455082451390598, + "grad_norm": 0.9112989902496338, + "learning_rate": 4.98491199407661e-06, + "loss": 0.7842, + "step": 665 + }, + { + "epoch": 0.245877430470096, + "grad_norm": 0.8956560492515564, + "learning_rate": 4.984858591967035e-06, + "loss": 0.7418, + "step": 666 + }, + { + "epoch": 0.24624661580113216, + "grad_norm": 0.9098480343818665, + "learning_rate": 4.984805095806346e-06, + "loss": 0.7978, + "step": 667 + }, + { + "epoch": 0.24661580113216836, + "grad_norm": 0.8892375230789185, + "learning_rate": 4.9847515055965685e-06, + "loss": 0.7563, + "step": 668 + }, + { + "epoch": 0.24698498646320452, + "grad_norm": 0.9025793671607971, + "learning_rate": 4.984697821339731e-06, + "loss": 0.7302, + "step": 669 + }, + { + "epoch": 0.2473541717942407, + "grad_norm": 0.9058409333229065, + "learning_rate": 4.984644043037864e-06, + "loss": 0.7938, + "step": 670 + }, + { + "epoch": 0.24772335712527688, + "grad_norm": 0.9041579365730286, + "learning_rate": 4.984590170693005e-06, + "loss": 0.7788, + "step": 671 + }, + { + "epoch": 0.24809254245631307, + "grad_norm": 0.9054029583930969, + "learning_rate": 4.9845362043071925e-06, + "loss": 0.7546, + "step": 672 + }, + { + "epoch": 0.24846172778734926, + "grad_norm": 0.9551224708557129, + "learning_rate": 4.984482143882469e-06, + "loss": 0.7458, + "step": 673 + }, + { + "epoch": 0.24883091311838543, + "grad_norm": 0.9546729922294617, + "learning_rate": 4.9844279894208795e-06, + "loss": 0.7808, + "step": 674 + }, + { + "epoch": 0.24920009844942162, + "grad_norm": 0.9500798583030701, + "learning_rate": 4.984373740924475e-06, + "loss": 0.7773, + "step": 675 + }, + { + "epoch": 0.24956928378045778, + "grad_norm": 0.9165859222412109, + "learning_rate": 4.984319398395308e-06, + "loss": 0.7596, + "step": 676 + }, + { + "epoch": 0.24993846911149398, + "grad_norm": 0.9380325675010681, + "learning_rate": 4.984264961835436e-06, + "loss": 0.799, + "step": 677 + }, + { + "epoch": 0.25030765444253017, + "grad_norm": 0.9391986131668091, + "learning_rate": 4.98421043124692e-06, + "loss": 0.7435, + "step": 678 + }, + { + "epoch": 0.2506768397735663, + "grad_norm": 1.0003539323806763, + "learning_rate": 4.984155806631823e-06, + "loss": 0.7798, + "step": 679 + }, + { + "epoch": 0.2510460251046025, + "grad_norm": 0.9141901731491089, + "learning_rate": 4.984101087992212e-06, + "loss": 0.7093, + "step": 680 + }, + { + "epoch": 0.2514152104356387, + "grad_norm": 1.0558290481567383, + "learning_rate": 4.98404627533016e-06, + "loss": 0.7651, + "step": 681 + }, + { + "epoch": 0.2517843957666749, + "grad_norm": 0.8751945495605469, + "learning_rate": 4.98399136864774e-06, + "loss": 0.7598, + "step": 682 + }, + { + "epoch": 0.2521535810977111, + "grad_norm": 0.8984244465827942, + "learning_rate": 4.9839363679470296e-06, + "loss": 0.7921, + "step": 683 + }, + { + "epoch": 0.2525227664287472, + "grad_norm": 0.9515509605407715, + "learning_rate": 4.9838812732301134e-06, + "loss": 0.7781, + "step": 684 + }, + { + "epoch": 0.2528919517597834, + "grad_norm": 0.9040102958679199, + "learning_rate": 4.983826084499074e-06, + "loss": 0.7435, + "step": 685 + }, + { + "epoch": 0.2532611370908196, + "grad_norm": 0.8813716769218445, + "learning_rate": 4.983770801756001e-06, + "loss": 0.7675, + "step": 686 + }, + { + "epoch": 0.2536303224218558, + "grad_norm": 0.9750944375991821, + "learning_rate": 4.983715425002987e-06, + "loss": 0.7389, + "step": 687 + }, + { + "epoch": 0.25399950775289193, + "grad_norm": 0.8693578839302063, + "learning_rate": 4.983659954242128e-06, + "loss": 0.7325, + "step": 688 + }, + { + "epoch": 0.2543686930839281, + "grad_norm": 0.934140145778656, + "learning_rate": 4.983604389475525e-06, + "loss": 0.8121, + "step": 689 + }, + { + "epoch": 0.2547378784149643, + "grad_norm": 0.9266787767410278, + "learning_rate": 4.983548730705278e-06, + "loss": 0.793, + "step": 690 + }, + { + "epoch": 0.2551070637460005, + "grad_norm": 0.9268523454666138, + "learning_rate": 4.9834929779334964e-06, + "loss": 0.7605, + "step": 691 + }, + { + "epoch": 0.2554762490770367, + "grad_norm": 0.9014139771461487, + "learning_rate": 4.983437131162289e-06, + "loss": 0.7518, + "step": 692 + }, + { + "epoch": 0.25584543440807284, + "grad_norm": 0.8949803113937378, + "learning_rate": 4.98338119039377e-06, + "loss": 0.7179, + "step": 693 + }, + { + "epoch": 0.25621461973910903, + "grad_norm": 0.9687701463699341, + "learning_rate": 4.983325155630056e-06, + "loss": 0.7602, + "step": 694 + }, + { + "epoch": 0.2565838050701452, + "grad_norm": 0.9221265316009521, + "learning_rate": 4.983269026873269e-06, + "loss": 0.7374, + "step": 695 + }, + { + "epoch": 0.2569529904011814, + "grad_norm": 0.9037137627601624, + "learning_rate": 4.983212804125533e-06, + "loss": 0.7424, + "step": 696 + }, + { + "epoch": 0.25732217573221755, + "grad_norm": 0.9276369214057922, + "learning_rate": 4.983156487388977e-06, + "loss": 0.8033, + "step": 697 + }, + { + "epoch": 0.25769136106325374, + "grad_norm": 0.964596152305603, + "learning_rate": 4.983100076665731e-06, + "loss": 0.8284, + "step": 698 + }, + { + "epoch": 0.25806054639428994, + "grad_norm": 0.9751665592193604, + "learning_rate": 4.983043571957931e-06, + "loss": 0.7919, + "step": 699 + }, + { + "epoch": 0.25842973172532613, + "grad_norm": 0.9727154970169067, + "learning_rate": 4.9829869732677146e-06, + "loss": 0.8094, + "step": 700 + }, + { + "epoch": 0.2587989170563623, + "grad_norm": 0.9635800719261169, + "learning_rate": 4.982930280597226e-06, + "loss": 0.7682, + "step": 701 + }, + { + "epoch": 0.25916810238739846, + "grad_norm": 0.8952401280403137, + "learning_rate": 4.98287349394861e-06, + "loss": 0.812, + "step": 702 + }, + { + "epoch": 0.25953728771843465, + "grad_norm": 0.905311107635498, + "learning_rate": 4.982816613324015e-06, + "loss": 0.7621, + "step": 703 + }, + { + "epoch": 0.25990647304947084, + "grad_norm": 0.9060837030410767, + "learning_rate": 4.982759638725595e-06, + "loss": 0.7343, + "step": 704 + }, + { + "epoch": 0.26027565838050704, + "grad_norm": 0.9590125679969788, + "learning_rate": 4.982702570155506e-06, + "loss": 0.7996, + "step": 705 + }, + { + "epoch": 0.2606448437115432, + "grad_norm": 0.9153867959976196, + "learning_rate": 4.9826454076159094e-06, + "loss": 0.7566, + "step": 706 + }, + { + "epoch": 0.26101402904257937, + "grad_norm": 0.8935747146606445, + "learning_rate": 4.982588151108966e-06, + "loss": 0.7675, + "step": 707 + }, + { + "epoch": 0.26138321437361556, + "grad_norm": 0.9266985058784485, + "learning_rate": 4.982530800636845e-06, + "loss": 0.7712, + "step": 708 + }, + { + "epoch": 0.26175239970465175, + "grad_norm": 0.9422087073326111, + "learning_rate": 4.982473356201718e-06, + "loss": 0.7768, + "step": 709 + }, + { + "epoch": 0.26212158503568794, + "grad_norm": 0.9119973182678223, + "learning_rate": 4.982415817805757e-06, + "loss": 0.7693, + "step": 710 + }, + { + "epoch": 0.2624907703667241, + "grad_norm": 0.8853102326393127, + "learning_rate": 4.982358185451141e-06, + "loss": 0.75, + "step": 711 + }, + { + "epoch": 0.2628599556977603, + "grad_norm": 0.9192734360694885, + "learning_rate": 4.982300459140051e-06, + "loss": 0.7715, + "step": 712 + }, + { + "epoch": 0.26322914102879647, + "grad_norm": 0.8763948082923889, + "learning_rate": 4.982242638874672e-06, + "loss": 0.7635, + "step": 713 + }, + { + "epoch": 0.26359832635983266, + "grad_norm": 0.8886423707008362, + "learning_rate": 4.982184724657192e-06, + "loss": 0.753, + "step": 714 + }, + { + "epoch": 0.2639675116908688, + "grad_norm": 0.9051578044891357, + "learning_rate": 4.9821267164898045e-06, + "loss": 0.7759, + "step": 715 + }, + { + "epoch": 0.264336697021905, + "grad_norm": 0.9290419220924377, + "learning_rate": 4.9820686143747045e-06, + "loss": 0.8023, + "step": 716 + }, + { + "epoch": 0.2647058823529412, + "grad_norm": 0.9183179140090942, + "learning_rate": 4.982010418314089e-06, + "loss": 0.7909, + "step": 717 + }, + { + "epoch": 0.2650750676839774, + "grad_norm": 0.9106544256210327, + "learning_rate": 4.981952128310165e-06, + "loss": 0.7534, + "step": 718 + }, + { + "epoch": 0.2654442530150135, + "grad_norm": 0.8992197513580322, + "learning_rate": 4.981893744365134e-06, + "loss": 0.7368, + "step": 719 + }, + { + "epoch": 0.2658134383460497, + "grad_norm": 0.8842350840568542, + "learning_rate": 4.981835266481209e-06, + "loss": 0.762, + "step": 720 + }, + { + "epoch": 0.2661826236770859, + "grad_norm": 0.9051290154457092, + "learning_rate": 4.9817766946606025e-06, + "loss": 0.7509, + "step": 721 + }, + { + "epoch": 0.2665518090081221, + "grad_norm": 0.9140417575836182, + "learning_rate": 4.9817180289055314e-06, + "loss": 0.7554, + "step": 722 + }, + { + "epoch": 0.2669209943391583, + "grad_norm": 0.8826530575752258, + "learning_rate": 4.981659269218216e-06, + "loss": 0.7498, + "step": 723 + }, + { + "epoch": 0.2672901796701944, + "grad_norm": 0.8926510214805603, + "learning_rate": 4.98160041560088e-06, + "loss": 0.7614, + "step": 724 + }, + { + "epoch": 0.2676593650012306, + "grad_norm": 0.9062178134918213, + "learning_rate": 4.9815414680557514e-06, + "loss": 0.7644, + "step": 725 + }, + { + "epoch": 0.2680285503322668, + "grad_norm": 0.9631175994873047, + "learning_rate": 4.981482426585063e-06, + "loss": 0.7816, + "step": 726 + }, + { + "epoch": 0.268397735663303, + "grad_norm": 0.9016267657279968, + "learning_rate": 4.981423291191047e-06, + "loss": 0.7367, + "step": 727 + }, + { + "epoch": 0.26876692099433913, + "grad_norm": 0.8919848799705505, + "learning_rate": 4.981364061875942e-06, + "loss": 0.7523, + "step": 728 + }, + { + "epoch": 0.2691361063253753, + "grad_norm": 0.8961866497993469, + "learning_rate": 4.981304738641991e-06, + "loss": 0.7743, + "step": 729 + }, + { + "epoch": 0.2695052916564115, + "grad_norm": 1.0036101341247559, + "learning_rate": 4.981245321491438e-06, + "loss": 0.7526, + "step": 730 + }, + { + "epoch": 0.2698744769874477, + "grad_norm": 0.9046428203582764, + "learning_rate": 4.9811858104265334e-06, + "loss": 0.7522, + "step": 731 + }, + { + "epoch": 0.2702436623184839, + "grad_norm": 0.9048503041267395, + "learning_rate": 4.981126205449529e-06, + "loss": 0.7233, + "step": 732 + }, + { + "epoch": 0.27061284764952004, + "grad_norm": 1.0362783670425415, + "learning_rate": 4.9810665065626805e-06, + "loss": 0.751, + "step": 733 + }, + { + "epoch": 0.27098203298055623, + "grad_norm": 0.9431849122047424, + "learning_rate": 4.981006713768248e-06, + "loss": 0.7831, + "step": 734 + }, + { + "epoch": 0.2713512183115924, + "grad_norm": 0.9257729649543762, + "learning_rate": 4.980946827068494e-06, + "loss": 0.7801, + "step": 735 + }, + { + "epoch": 0.2717204036426286, + "grad_norm": 0.8980259299278259, + "learning_rate": 4.980886846465686e-06, + "loss": 0.7452, + "step": 736 + }, + { + "epoch": 0.27208958897366475, + "grad_norm": 0.895393967628479, + "learning_rate": 4.980826771962094e-06, + "loss": 0.72, + "step": 737 + }, + { + "epoch": 0.27245877430470095, + "grad_norm": 0.8936492800712585, + "learning_rate": 4.980766603559991e-06, + "loss": 0.7668, + "step": 738 + }, + { + "epoch": 0.27282795963573714, + "grad_norm": 0.8899438977241516, + "learning_rate": 4.980706341261655e-06, + "loss": 0.7224, + "step": 739 + }, + { + "epoch": 0.27319714496677333, + "grad_norm": 0.9478532075881958, + "learning_rate": 4.980645985069367e-06, + "loss": 0.7859, + "step": 740 + }, + { + "epoch": 0.2735663302978095, + "grad_norm": 0.9106540083885193, + "learning_rate": 4.980585534985412e-06, + "loss": 0.7769, + "step": 741 + }, + { + "epoch": 0.27393551562884566, + "grad_norm": 0.9290282130241394, + "learning_rate": 4.9805249910120776e-06, + "loss": 0.7358, + "step": 742 + }, + { + "epoch": 0.27430470095988185, + "grad_norm": 0.9177589416503906, + "learning_rate": 4.980464353151654e-06, + "loss": 0.7647, + "step": 743 + }, + { + "epoch": 0.27467388629091805, + "grad_norm": 0.900225818157196, + "learning_rate": 4.980403621406439e-06, + "loss": 0.7416, + "step": 744 + }, + { + "epoch": 0.27504307162195424, + "grad_norm": 0.905703604221344, + "learning_rate": 4.980342795778728e-06, + "loss": 0.8068, + "step": 745 + }, + { + "epoch": 0.2754122569529904, + "grad_norm": 0.8948282599449158, + "learning_rate": 4.980281876270826e-06, + "loss": 0.7597, + "step": 746 + }, + { + "epoch": 0.27578144228402657, + "grad_norm": 0.9245081543922424, + "learning_rate": 4.980220862885038e-06, + "loss": 0.7406, + "step": 747 + }, + { + "epoch": 0.27615062761506276, + "grad_norm": 0.8864959478378296, + "learning_rate": 4.980159755623673e-06, + "loss": 0.7464, + "step": 748 + }, + { + "epoch": 0.27651981294609895, + "grad_norm": 0.9063557386398315, + "learning_rate": 4.9800985544890425e-06, + "loss": 0.77, + "step": 749 + }, + { + "epoch": 0.27688899827713515, + "grad_norm": 0.8840070962905884, + "learning_rate": 4.9800372594834656e-06, + "loss": 0.7356, + "step": 750 + }, + { + "epoch": 0.2772581836081713, + "grad_norm": 0.8699694275856018, + "learning_rate": 4.979975870609261e-06, + "loss": 0.747, + "step": 751 + }, + { + "epoch": 0.2776273689392075, + "grad_norm": 0.9077945351600647, + "learning_rate": 4.979914387868753e-06, + "loss": 0.7428, + "step": 752 + }, + { + "epoch": 0.27799655427024367, + "grad_norm": 0.9290851354598999, + "learning_rate": 4.979852811264267e-06, + "loss": 0.7509, + "step": 753 + }, + { + "epoch": 0.27836573960127986, + "grad_norm": 0.928439199924469, + "learning_rate": 4.979791140798136e-06, + "loss": 0.7949, + "step": 754 + }, + { + "epoch": 0.278734924932316, + "grad_norm": 0.8840503692626953, + "learning_rate": 4.9797293764726924e-06, + "loss": 0.7167, + "step": 755 + }, + { + "epoch": 0.2791041102633522, + "grad_norm": 0.9051108360290527, + "learning_rate": 4.979667518290274e-06, + "loss": 0.7636, + "step": 756 + }, + { + "epoch": 0.2794732955943884, + "grad_norm": 0.9400845766067505, + "learning_rate": 4.979605566253224e-06, + "loss": 0.7595, + "step": 757 + }, + { + "epoch": 0.2798424809254246, + "grad_norm": 0.9284423589706421, + "learning_rate": 4.979543520363884e-06, + "loss": 0.728, + "step": 758 + }, + { + "epoch": 0.28021166625646077, + "grad_norm": 0.8976213932037354, + "learning_rate": 4.979481380624606e-06, + "loss": 0.7399, + "step": 759 + }, + { + "epoch": 0.2805808515874969, + "grad_norm": 0.8708903193473816, + "learning_rate": 4.97941914703774e-06, + "loss": 0.7534, + "step": 760 + }, + { + "epoch": 0.2809500369185331, + "grad_norm": 0.9106154441833496, + "learning_rate": 4.979356819605641e-06, + "loss": 0.7552, + "step": 761 + }, + { + "epoch": 0.2813192222495693, + "grad_norm": 0.9194098711013794, + "learning_rate": 4.979294398330668e-06, + "loss": 0.7365, + "step": 762 + }, + { + "epoch": 0.2816884075806055, + "grad_norm": 0.9231695532798767, + "learning_rate": 4.9792318832151864e-06, + "loss": 0.7607, + "step": 763 + }, + { + "epoch": 0.2820575929116416, + "grad_norm": 0.9608139991760254, + "learning_rate": 4.97916927426156e-06, + "loss": 0.7909, + "step": 764 + }, + { + "epoch": 0.2824267782426778, + "grad_norm": 0.9300723075866699, + "learning_rate": 4.979106571472159e-06, + "loss": 0.7453, + "step": 765 + }, + { + "epoch": 0.282795963573714, + "grad_norm": 0.9103767275810242, + "learning_rate": 4.979043774849356e-06, + "loss": 0.7816, + "step": 766 + }, + { + "epoch": 0.2831651489047502, + "grad_norm": 0.9026121497154236, + "learning_rate": 4.9789808843955294e-06, + "loss": 0.7656, + "step": 767 + }, + { + "epoch": 0.2835343342357864, + "grad_norm": 0.8946061730384827, + "learning_rate": 4.978917900113059e-06, + "loss": 0.7399, + "step": 768 + }, + { + "epoch": 0.2839035195668225, + "grad_norm": 0.8899773955345154, + "learning_rate": 4.978854822004327e-06, + "loss": 0.7666, + "step": 769 + }, + { + "epoch": 0.2842727048978587, + "grad_norm": 0.9117376804351807, + "learning_rate": 4.978791650071723e-06, + "loss": 0.7552, + "step": 770 + }, + { + "epoch": 0.2846418902288949, + "grad_norm": 0.8872507810592651, + "learning_rate": 4.978728384317637e-06, + "loss": 0.7367, + "step": 771 + }, + { + "epoch": 0.2850110755599311, + "grad_norm": 0.9358695149421692, + "learning_rate": 4.978665024744465e-06, + "loss": 0.7751, + "step": 772 + }, + { + "epoch": 0.28538026089096724, + "grad_norm": 0.879411518573761, + "learning_rate": 4.9786015713546035e-06, + "loss": 0.7681, + "step": 773 + }, + { + "epoch": 0.28574944622200343, + "grad_norm": 0.9180899262428284, + "learning_rate": 4.978538024150455e-06, + "loss": 0.7481, + "step": 774 + }, + { + "epoch": 0.2861186315530396, + "grad_norm": 0.8577932119369507, + "learning_rate": 4.978474383134424e-06, + "loss": 0.7133, + "step": 775 + }, + { + "epoch": 0.2864878168840758, + "grad_norm": 0.8630688190460205, + "learning_rate": 4.97841064830892e-06, + "loss": 0.7232, + "step": 776 + }, + { + "epoch": 0.286857002215112, + "grad_norm": 0.902454137802124, + "learning_rate": 4.978346819676355e-06, + "loss": 0.7438, + "step": 777 + }, + { + "epoch": 0.28722618754614815, + "grad_norm": 0.9192384481430054, + "learning_rate": 4.9782828972391466e-06, + "loss": 0.8082, + "step": 778 + }, + { + "epoch": 0.28759537287718434, + "grad_norm": 0.9017093777656555, + "learning_rate": 4.9782188809997106e-06, + "loss": 0.7713, + "step": 779 + }, + { + "epoch": 0.28796455820822053, + "grad_norm": 0.8741960525512695, + "learning_rate": 4.978154770960473e-06, + "loss": 0.7601, + "step": 780 + }, + { + "epoch": 0.2883337435392567, + "grad_norm": 0.8619513511657715, + "learning_rate": 4.978090567123859e-06, + "loss": 0.7267, + "step": 781 + }, + { + "epoch": 0.28870292887029286, + "grad_norm": 0.9079142808914185, + "learning_rate": 4.9780262694923e-06, + "loss": 0.7804, + "step": 782 + }, + { + "epoch": 0.28907211420132906, + "grad_norm": 0.8758432865142822, + "learning_rate": 4.977961878068228e-06, + "loss": 0.7271, + "step": 783 + }, + { + "epoch": 0.28944129953236525, + "grad_norm": 0.8986886143684387, + "learning_rate": 4.977897392854081e-06, + "loss": 0.7885, + "step": 784 + }, + { + "epoch": 0.28981048486340144, + "grad_norm": 0.9059154391288757, + "learning_rate": 4.9778328138523e-06, + "loss": 0.7367, + "step": 785 + }, + { + "epoch": 0.29017967019443763, + "grad_norm": 0.9500458240509033, + "learning_rate": 4.9777681410653295e-06, + "loss": 0.7784, + "step": 786 + }, + { + "epoch": 0.29054885552547377, + "grad_norm": 0.9947640299797058, + "learning_rate": 4.977703374495616e-06, + "loss": 0.7704, + "step": 787 + }, + { + "epoch": 0.29091804085650996, + "grad_norm": 0.9205284118652344, + "learning_rate": 4.977638514145612e-06, + "loss": 0.7906, + "step": 788 + }, + { + "epoch": 0.29128722618754616, + "grad_norm": 0.925631582736969, + "learning_rate": 4.977573560017772e-06, + "loss": 0.7847, + "step": 789 + }, + { + "epoch": 0.29165641151858235, + "grad_norm": 0.96160888671875, + "learning_rate": 4.977508512114556e-06, + "loss": 0.7993, + "step": 790 + }, + { + "epoch": 0.2920255968496185, + "grad_norm": 0.9323878884315491, + "learning_rate": 4.977443370438423e-06, + "loss": 0.7811, + "step": 791 + }, + { + "epoch": 0.2923947821806547, + "grad_norm": 0.9564676880836487, + "learning_rate": 4.977378134991841e-06, + "loss": 0.7759, + "step": 792 + }, + { + "epoch": 0.29276396751169087, + "grad_norm": 0.8873472213745117, + "learning_rate": 4.977312805777279e-06, + "loss": 0.7757, + "step": 793 + }, + { + "epoch": 0.29313315284272706, + "grad_norm": 0.9213703274726868, + "learning_rate": 4.977247382797208e-06, + "loss": 0.7134, + "step": 794 + }, + { + "epoch": 0.29350233817376326, + "grad_norm": 0.9845170974731445, + "learning_rate": 4.977181866054106e-06, + "loss": 0.7865, + "step": 795 + }, + { + "epoch": 0.2938715235047994, + "grad_norm": 0.9223014116287231, + "learning_rate": 4.977116255550452e-06, + "loss": 0.7784, + "step": 796 + }, + { + "epoch": 0.2942407088358356, + "grad_norm": 0.8821209073066711, + "learning_rate": 4.97705055128873e-06, + "loss": 0.7164, + "step": 797 + }, + { + "epoch": 0.2946098941668718, + "grad_norm": 0.8854329586029053, + "learning_rate": 4.976984753271427e-06, + "loss": 0.7593, + "step": 798 + }, + { + "epoch": 0.29497907949790797, + "grad_norm": 0.9611573219299316, + "learning_rate": 4.976918861501031e-06, + "loss": 0.7343, + "step": 799 + }, + { + "epoch": 0.2953482648289441, + "grad_norm": 0.9474232792854309, + "learning_rate": 4.976852875980039e-06, + "loss": 0.7501, + "step": 800 + }, + { + "epoch": 0.2957174501599803, + "grad_norm": 0.9252268671989441, + "learning_rate": 4.976786796710947e-06, + "loss": 0.7608, + "step": 801 + }, + { + "epoch": 0.2960866354910165, + "grad_norm": 0.897132396697998, + "learning_rate": 4.976720623696257e-06, + "loss": 0.7549, + "step": 802 + }, + { + "epoch": 0.2964558208220527, + "grad_norm": 0.8893219828605652, + "learning_rate": 4.976654356938472e-06, + "loss": 0.7268, + "step": 803 + }, + { + "epoch": 0.2968250061530889, + "grad_norm": 0.9587628841400146, + "learning_rate": 4.976587996440102e-06, + "loss": 0.7845, + "step": 804 + }, + { + "epoch": 0.297194191484125, + "grad_norm": 0.8750391006469727, + "learning_rate": 4.976521542203658e-06, + "loss": 0.7305, + "step": 805 + }, + { + "epoch": 0.2975633768151612, + "grad_norm": 0.9181191921234131, + "learning_rate": 4.976454994231656e-06, + "loss": 0.7969, + "step": 806 + }, + { + "epoch": 0.2979325621461974, + "grad_norm": 0.934037983417511, + "learning_rate": 4.976388352526612e-06, + "loss": 0.7438, + "step": 807 + }, + { + "epoch": 0.2983017474772336, + "grad_norm": 0.9424565434455872, + "learning_rate": 4.976321617091052e-06, + "loss": 0.7631, + "step": 808 + }, + { + "epoch": 0.29867093280826973, + "grad_norm": 0.9459184408187866, + "learning_rate": 4.976254787927499e-06, + "loss": 0.7616, + "step": 809 + }, + { + "epoch": 0.2990401181393059, + "grad_norm": 0.912196934223175, + "learning_rate": 4.976187865038485e-06, + "loss": 0.7814, + "step": 810 + }, + { + "epoch": 0.2994093034703421, + "grad_norm": 0.8629269003868103, + "learning_rate": 4.976120848426542e-06, + "loss": 0.7365, + "step": 811 + }, + { + "epoch": 0.2997784888013783, + "grad_norm": 0.9021220207214355, + "learning_rate": 4.9760537380942055e-06, + "loss": 0.7271, + "step": 812 + }, + { + "epoch": 0.3001476741324145, + "grad_norm": 0.8790842294692993, + "learning_rate": 4.975986534044017e-06, + "loss": 0.7211, + "step": 813 + }, + { + "epoch": 0.30051685946345064, + "grad_norm": 0.9232593178749084, + "learning_rate": 4.975919236278519e-06, + "loss": 0.7557, + "step": 814 + }, + { + "epoch": 0.30088604479448683, + "grad_norm": 0.9208589792251587, + "learning_rate": 4.975851844800259e-06, + "loss": 0.7585, + "step": 815 + }, + { + "epoch": 0.301255230125523, + "grad_norm": 0.8960398435592651, + "learning_rate": 4.9757843596117894e-06, + "loss": 0.81, + "step": 816 + }, + { + "epoch": 0.3016244154565592, + "grad_norm": 0.9213392734527588, + "learning_rate": 4.975716780715662e-06, + "loss": 0.7421, + "step": 817 + }, + { + "epoch": 0.30199360078759535, + "grad_norm": 0.882556140422821, + "learning_rate": 4.975649108114437e-06, + "loss": 0.7653, + "step": 818 + }, + { + "epoch": 0.30236278611863154, + "grad_norm": 0.8911692500114441, + "learning_rate": 4.9755813418106735e-06, + "loss": 0.7357, + "step": 819 + }, + { + "epoch": 0.30273197144966774, + "grad_norm": 0.9037520885467529, + "learning_rate": 4.975513481806939e-06, + "loss": 0.7503, + "step": 820 + }, + { + "epoch": 0.30310115678070393, + "grad_norm": 0.9327360391616821, + "learning_rate": 4.975445528105799e-06, + "loss": 0.7269, + "step": 821 + }, + { + "epoch": 0.3034703421117401, + "grad_norm": 0.8998024463653564, + "learning_rate": 4.9753774807098275e-06, + "loss": 0.7384, + "step": 822 + }, + { + "epoch": 0.30383952744277626, + "grad_norm": 0.9088361859321594, + "learning_rate": 4.975309339621599e-06, + "loss": 0.7571, + "step": 823 + }, + { + "epoch": 0.30420871277381245, + "grad_norm": 0.8943783640861511, + "learning_rate": 4.975241104843694e-06, + "loss": 0.7007, + "step": 824 + }, + { + "epoch": 0.30457789810484864, + "grad_norm": 0.9370293021202087, + "learning_rate": 4.975172776378694e-06, + "loss": 0.746, + "step": 825 + }, + { + "epoch": 0.30494708343588484, + "grad_norm": 0.946474552154541, + "learning_rate": 4.9751043542291854e-06, + "loss": 0.7987, + "step": 826 + }, + { + "epoch": 0.305316268766921, + "grad_norm": 0.9123954772949219, + "learning_rate": 4.975035838397759e-06, + "loss": 0.7611, + "step": 827 + }, + { + "epoch": 0.30568545409795717, + "grad_norm": 0.9167425632476807, + "learning_rate": 4.974967228887007e-06, + "loss": 0.7794, + "step": 828 + }, + { + "epoch": 0.30605463942899336, + "grad_norm": 0.9110398292541504, + "learning_rate": 4.974898525699526e-06, + "loss": 0.763, + "step": 829 + }, + { + "epoch": 0.30642382476002955, + "grad_norm": 0.9574660062789917, + "learning_rate": 4.974829728837917e-06, + "loss": 0.7552, + "step": 830 + }, + { + "epoch": 0.3067930100910657, + "grad_norm": 0.9426335096359253, + "learning_rate": 4.974760838304784e-06, + "loss": 0.7404, + "step": 831 + }, + { + "epoch": 0.3071621954221019, + "grad_norm": 0.9049092531204224, + "learning_rate": 4.974691854102734e-06, + "loss": 0.7394, + "step": 832 + }, + { + "epoch": 0.3075313807531381, + "grad_norm": 0.8891183733940125, + "learning_rate": 4.974622776234379e-06, + "loss": 0.734, + "step": 833 + }, + { + "epoch": 0.30790056608417427, + "grad_norm": 0.8763934373855591, + "learning_rate": 4.974553604702332e-06, + "loss": 0.7341, + "step": 834 + }, + { + "epoch": 0.30826975141521046, + "grad_norm": 0.9248993396759033, + "learning_rate": 4.974484339509213e-06, + "loss": 0.7694, + "step": 835 + }, + { + "epoch": 0.3086389367462466, + "grad_norm": 0.8956395983695984, + "learning_rate": 4.974414980657642e-06, + "loss": 0.7665, + "step": 836 + }, + { + "epoch": 0.3090081220772828, + "grad_norm": 0.8790796399116516, + "learning_rate": 4.974345528150245e-06, + "loss": 0.7492, + "step": 837 + }, + { + "epoch": 0.309377307408319, + "grad_norm": 0.9157189726829529, + "learning_rate": 4.974275981989651e-06, + "loss": 0.7643, + "step": 838 + }, + { + "epoch": 0.3097464927393552, + "grad_norm": 0.9237195253372192, + "learning_rate": 4.974206342178492e-06, + "loss": 0.755, + "step": 839 + }, + { + "epoch": 0.3101156780703913, + "grad_norm": 0.8979294896125793, + "learning_rate": 4.974136608719404e-06, + "loss": 0.8043, + "step": 840 + }, + { + "epoch": 0.3104848634014275, + "grad_norm": 0.9376009702682495, + "learning_rate": 4.974066781615026e-06, + "loss": 0.7899, + "step": 841 + }, + { + "epoch": 0.3108540487324637, + "grad_norm": 0.926584780216217, + "learning_rate": 4.973996860868001e-06, + "loss": 0.7436, + "step": 842 + }, + { + "epoch": 0.3112232340634999, + "grad_norm": 0.8795569539070129, + "learning_rate": 4.973926846480975e-06, + "loss": 0.7399, + "step": 843 + }, + { + "epoch": 0.3115924193945361, + "grad_norm": 0.9180058836936951, + "learning_rate": 4.973856738456599e-06, + "loss": 0.7332, + "step": 844 + }, + { + "epoch": 0.3119616047255722, + "grad_norm": 0.9014691710472107, + "learning_rate": 4.973786536797527e-06, + "loss": 0.719, + "step": 845 + }, + { + "epoch": 0.3123307900566084, + "grad_norm": 0.9873002767562866, + "learning_rate": 4.973716241506415e-06, + "loss": 0.7921, + "step": 846 + }, + { + "epoch": 0.3126999753876446, + "grad_norm": 0.8903076648712158, + "learning_rate": 4.973645852585923e-06, + "loss": 0.7645, + "step": 847 + }, + { + "epoch": 0.3130691607186808, + "grad_norm": 0.8644299507141113, + "learning_rate": 4.973575370038718e-06, + "loss": 0.7022, + "step": 848 + }, + { + "epoch": 0.31343834604971693, + "grad_norm": 0.9160073399543762, + "learning_rate": 4.973504793867465e-06, + "loss": 0.7261, + "step": 849 + }, + { + "epoch": 0.3138075313807531, + "grad_norm": 0.937968373298645, + "learning_rate": 4.973434124074836e-06, + "loss": 0.7851, + "step": 850 + }, + { + "epoch": 0.3141767167117893, + "grad_norm": 0.95966637134552, + "learning_rate": 4.973363360663506e-06, + "loss": 0.7395, + "step": 851 + }, + { + "epoch": 0.3145459020428255, + "grad_norm": 0.8805307745933533, + "learning_rate": 4.973292503636154e-06, + "loss": 0.7323, + "step": 852 + }, + { + "epoch": 0.3149150873738617, + "grad_norm": 0.9085668921470642, + "learning_rate": 4.97322155299546e-06, + "loss": 0.6853, + "step": 853 + }, + { + "epoch": 0.31528427270489784, + "grad_norm": 0.9099245071411133, + "learning_rate": 4.973150508744111e-06, + "loss": 0.7732, + "step": 854 + }, + { + "epoch": 0.31565345803593403, + "grad_norm": 0.8982505202293396, + "learning_rate": 4.973079370884797e-06, + "loss": 0.7127, + "step": 855 + }, + { + "epoch": 0.3160226433669702, + "grad_norm": 0.9104130864143372, + "learning_rate": 4.973008139420209e-06, + "loss": 0.7405, + "step": 856 + }, + { + "epoch": 0.3163918286980064, + "grad_norm": 0.8904514908790588, + "learning_rate": 4.9729368143530435e-06, + "loss": 0.7901, + "step": 857 + }, + { + "epoch": 0.31676101402904255, + "grad_norm": 0.9432110786437988, + "learning_rate": 4.9728653956859995e-06, + "loss": 0.7735, + "step": 858 + }, + { + "epoch": 0.31713019936007875, + "grad_norm": 0.9377402067184448, + "learning_rate": 4.97279388342178e-06, + "loss": 0.7634, + "step": 859 + }, + { + "epoch": 0.31749938469111494, + "grad_norm": 0.8846672177314758, + "learning_rate": 4.972722277563094e-06, + "loss": 0.7373, + "step": 860 + }, + { + "epoch": 0.31786857002215113, + "grad_norm": 0.907082200050354, + "learning_rate": 4.97265057811265e-06, + "loss": 0.746, + "step": 861 + }, + { + "epoch": 0.3182377553531873, + "grad_norm": 0.92213374376297, + "learning_rate": 4.972578785073161e-06, + "loss": 0.7361, + "step": 862 + }, + { + "epoch": 0.31860694068422346, + "grad_norm": 0.950560986995697, + "learning_rate": 4.972506898447346e-06, + "loss": 0.8017, + "step": 863 + }, + { + "epoch": 0.31897612601525965, + "grad_norm": 0.880368709564209, + "learning_rate": 4.972434918237925e-06, + "loss": 0.7593, + "step": 864 + }, + { + "epoch": 0.31934531134629585, + "grad_norm": 1.2160859107971191, + "learning_rate": 4.972362844447623e-06, + "loss": 0.7608, + "step": 865 + }, + { + "epoch": 0.31971449667733204, + "grad_norm": 0.9254541993141174, + "learning_rate": 4.972290677079168e-06, + "loss": 0.6959, + "step": 866 + }, + { + "epoch": 0.3200836820083682, + "grad_norm": 0.8902239203453064, + "learning_rate": 4.97221841613529e-06, + "loss": 0.7166, + "step": 867 + }, + { + "epoch": 0.32045286733940437, + "grad_norm": 0.9283355474472046, + "learning_rate": 4.972146061618726e-06, + "loss": 0.8209, + "step": 868 + }, + { + "epoch": 0.32082205267044056, + "grad_norm": 0.906304657459259, + "learning_rate": 4.972073613532214e-06, + "loss": 0.7639, + "step": 869 + }, + { + "epoch": 0.32119123800147675, + "grad_norm": 0.9373779892921448, + "learning_rate": 4.972001071878495e-06, + "loss": 0.7908, + "step": 870 + }, + { + "epoch": 0.32156042333251295, + "grad_norm": 0.9056022763252258, + "learning_rate": 4.971928436660316e-06, + "loss": 0.768, + "step": 871 + }, + { + "epoch": 0.3219296086635491, + "grad_norm": 0.8719453811645508, + "learning_rate": 4.971855707880426e-06, + "loss": 0.7401, + "step": 872 + }, + { + "epoch": 0.3222987939945853, + "grad_norm": 0.9360424876213074, + "learning_rate": 4.971782885541578e-06, + "loss": 0.777, + "step": 873 + }, + { + "epoch": 0.32266797932562147, + "grad_norm": 0.8848892450332642, + "learning_rate": 4.971709969646527e-06, + "loss": 0.6993, + "step": 874 + }, + { + "epoch": 0.32303716465665766, + "grad_norm": 0.8988775610923767, + "learning_rate": 4.971636960198033e-06, + "loss": 0.7392, + "step": 875 + }, + { + "epoch": 0.3234063499876938, + "grad_norm": 0.901785135269165, + "learning_rate": 4.971563857198862e-06, + "loss": 0.7618, + "step": 876 + }, + { + "epoch": 0.32377553531873, + "grad_norm": 0.8972448110580444, + "learning_rate": 4.971490660651778e-06, + "loss": 0.7159, + "step": 877 + }, + { + "epoch": 0.3241447206497662, + "grad_norm": 0.9601827263832092, + "learning_rate": 4.971417370559552e-06, + "loss": 0.748, + "step": 878 + }, + { + "epoch": 0.3245139059808024, + "grad_norm": 0.8916758894920349, + "learning_rate": 4.97134398692496e-06, + "loss": 0.7598, + "step": 879 + }, + { + "epoch": 0.32488309131183857, + "grad_norm": 0.8835681080818176, + "learning_rate": 4.971270509750778e-06, + "loss": 0.6888, + "step": 880 + }, + { + "epoch": 0.3252522766428747, + "grad_norm": 0.9150434732437134, + "learning_rate": 4.971196939039786e-06, + "loss": 0.7612, + "step": 881 + }, + { + "epoch": 0.3256214619739109, + "grad_norm": 0.8889224529266357, + "learning_rate": 4.97112327479477e-06, + "loss": 0.7409, + "step": 882 + }, + { + "epoch": 0.3259906473049471, + "grad_norm": 0.8889238834381104, + "learning_rate": 4.971049517018518e-06, + "loss": 0.7537, + "step": 883 + }, + { + "epoch": 0.3263598326359833, + "grad_norm": 0.9054772853851318, + "learning_rate": 4.970975665713822e-06, + "loss": 0.7535, + "step": 884 + }, + { + "epoch": 0.3267290179670194, + "grad_norm": 0.9268242716789246, + "learning_rate": 4.970901720883477e-06, + "loss": 0.7688, + "step": 885 + }, + { + "epoch": 0.3270982032980556, + "grad_norm": 0.9592145681381226, + "learning_rate": 4.970827682530282e-06, + "loss": 0.751, + "step": 886 + }, + { + "epoch": 0.3274673886290918, + "grad_norm": 0.9229305386543274, + "learning_rate": 4.970753550657038e-06, + "loss": 0.755, + "step": 887 + }, + { + "epoch": 0.327836573960128, + "grad_norm": 0.9249312281608582, + "learning_rate": 4.970679325266552e-06, + "loss": 0.7411, + "step": 888 + }, + { + "epoch": 0.3282057592911642, + "grad_norm": 0.8924015164375305, + "learning_rate": 4.970605006361634e-06, + "loss": 0.7259, + "step": 889 + }, + { + "epoch": 0.3285749446222003, + "grad_norm": 0.8923146724700928, + "learning_rate": 4.970530593945096e-06, + "loss": 0.75, + "step": 890 + }, + { + "epoch": 0.3289441299532365, + "grad_norm": 0.9257709980010986, + "learning_rate": 4.9704560880197546e-06, + "loss": 0.7579, + "step": 891 + }, + { + "epoch": 0.3293133152842727, + "grad_norm": 0.8968879580497742, + "learning_rate": 4.97038148858843e-06, + "loss": 0.7173, + "step": 892 + }, + { + "epoch": 0.3296825006153089, + "grad_norm": 0.9248254299163818, + "learning_rate": 4.970306795653946e-06, + "loss": 0.7422, + "step": 893 + }, + { + "epoch": 0.33005168594634504, + "grad_norm": 0.8645913600921631, + "learning_rate": 4.970232009219129e-06, + "loss": 0.7232, + "step": 894 + }, + { + "epoch": 0.33042087127738123, + "grad_norm": 0.8754826188087463, + "learning_rate": 4.97015712928681e-06, + "loss": 0.7279, + "step": 895 + }, + { + "epoch": 0.3307900566084174, + "grad_norm": 0.8828722238540649, + "learning_rate": 4.970082155859823e-06, + "loss": 0.7185, + "step": 896 + }, + { + "epoch": 0.3311592419394536, + "grad_norm": 0.8933262825012207, + "learning_rate": 4.970007088941007e-06, + "loss": 0.7293, + "step": 897 + }, + { + "epoch": 0.3315284272704898, + "grad_norm": 0.908781886100769, + "learning_rate": 4.9699319285332016e-06, + "loss": 0.7549, + "step": 898 + }, + { + "epoch": 0.33189761260152595, + "grad_norm": 0.8825910091400146, + "learning_rate": 4.969856674639252e-06, + "loss": 0.767, + "step": 899 + }, + { + "epoch": 0.33226679793256214, + "grad_norm": 0.9013200402259827, + "learning_rate": 4.969781327262008e-06, + "loss": 0.7292, + "step": 900 + }, + { + "epoch": 0.33263598326359833, + "grad_norm": 0.944476306438446, + "learning_rate": 4.969705886404319e-06, + "loss": 0.7586, + "step": 901 + }, + { + "epoch": 0.3330051685946345, + "grad_norm": 0.9089605808258057, + "learning_rate": 4.9696303520690415e-06, + "loss": 0.7238, + "step": 902 + }, + { + "epoch": 0.33337435392567066, + "grad_norm": 0.88468998670578, + "learning_rate": 4.969554724259036e-06, + "loss": 0.7392, + "step": 903 + }, + { + "epoch": 0.33374353925670686, + "grad_norm": 0.9044007062911987, + "learning_rate": 4.969479002977162e-06, + "loss": 0.7202, + "step": 904 + }, + { + "epoch": 0.33411272458774305, + "grad_norm": 0.907281756401062, + "learning_rate": 4.969403188226288e-06, + "loss": 0.7362, + "step": 905 + }, + { + "epoch": 0.33448190991877924, + "grad_norm": 0.9365758895874023, + "learning_rate": 4.969327280009282e-06, + "loss": 0.7881, + "step": 906 + }, + { + "epoch": 0.33485109524981543, + "grad_norm": 0.9953451752662659, + "learning_rate": 4.969251278329018e-06, + "loss": 0.7516, + "step": 907 + }, + { + "epoch": 0.33522028058085157, + "grad_norm": 0.9089345932006836, + "learning_rate": 4.969175183188373e-06, + "loss": 0.7508, + "step": 908 + }, + { + "epoch": 0.33558946591188776, + "grad_norm": 0.9271081686019897, + "learning_rate": 4.969098994590226e-06, + "loss": 0.7696, + "step": 909 + }, + { + "epoch": 0.33595865124292396, + "grad_norm": 0.9360471367835999, + "learning_rate": 4.9690227125374615e-06, + "loss": 0.7608, + "step": 910 + }, + { + "epoch": 0.33632783657396015, + "grad_norm": 0.9032567739486694, + "learning_rate": 4.968946337032967e-06, + "loss": 0.7514, + "step": 911 + }, + { + "epoch": 0.3366970219049963, + "grad_norm": 0.9877912998199463, + "learning_rate": 4.9688698680796325e-06, + "loss": 0.765, + "step": 912 + }, + { + "epoch": 0.3370662072360325, + "grad_norm": 0.9205021858215332, + "learning_rate": 4.9687933056803525e-06, + "loss": 0.7753, + "step": 913 + }, + { + "epoch": 0.33743539256706867, + "grad_norm": 0.8936699032783508, + "learning_rate": 4.968716649838025e-06, + "loss": 0.745, + "step": 914 + }, + { + "epoch": 0.33780457789810486, + "grad_norm": 0.9072567820549011, + "learning_rate": 4.968639900555552e-06, + "loss": 0.7801, + "step": 915 + }, + { + "epoch": 0.33817376322914106, + "grad_norm": 0.8919604420661926, + "learning_rate": 4.968563057835837e-06, + "loss": 0.7493, + "step": 916 + }, + { + "epoch": 0.3385429485601772, + "grad_norm": 0.9033501148223877, + "learning_rate": 4.96848612168179e-06, + "loss": 0.7606, + "step": 917 + }, + { + "epoch": 0.3389121338912134, + "grad_norm": 0.916556715965271, + "learning_rate": 4.968409092096322e-06, + "loss": 0.719, + "step": 918 + }, + { + "epoch": 0.3392813192222496, + "grad_norm": 0.9027720093727112, + "learning_rate": 4.968331969082349e-06, + "loss": 0.7027, + "step": 919 + }, + { + "epoch": 0.33965050455328577, + "grad_norm": 1.002319574356079, + "learning_rate": 4.96825475264279e-06, + "loss": 0.7999, + "step": 920 + }, + { + "epoch": 0.3400196898843219, + "grad_norm": 0.9688315391540527, + "learning_rate": 4.968177442780568e-06, + "loss": 0.8079, + "step": 921 + }, + { + "epoch": 0.3403888752153581, + "grad_norm": 0.9085078239440918, + "learning_rate": 4.968100039498609e-06, + "loss": 0.7369, + "step": 922 + }, + { + "epoch": 0.3407580605463943, + "grad_norm": 0.9328852295875549, + "learning_rate": 4.968022542799842e-06, + "loss": 0.7864, + "step": 923 + }, + { + "epoch": 0.3411272458774305, + "grad_norm": 0.9257019758224487, + "learning_rate": 4.9679449526872e-06, + "loss": 0.7838, + "step": 924 + }, + { + "epoch": 0.3414964312084667, + "grad_norm": 0.9182167053222656, + "learning_rate": 4.9678672691636214e-06, + "loss": 0.7348, + "step": 925 + }, + { + "epoch": 0.3418656165395028, + "grad_norm": 0.928878664970398, + "learning_rate": 4.967789492232046e-06, + "loss": 0.7564, + "step": 926 + }, + { + "epoch": 0.342234801870539, + "grad_norm": 0.8896941542625427, + "learning_rate": 4.967711621895416e-06, + "loss": 0.7435, + "step": 927 + }, + { + "epoch": 0.3426039872015752, + "grad_norm": 0.9109853506088257, + "learning_rate": 4.9676336581566795e-06, + "loss": 0.7818, + "step": 928 + }, + { + "epoch": 0.3429731725326114, + "grad_norm": 0.899246096611023, + "learning_rate": 4.96755560101879e-06, + "loss": 0.7404, + "step": 929 + }, + { + "epoch": 0.34334235786364753, + "grad_norm": 0.9120781421661377, + "learning_rate": 4.967477450484698e-06, + "loss": 0.7928, + "step": 930 + }, + { + "epoch": 0.3437115431946837, + "grad_norm": 0.9362387657165527, + "learning_rate": 4.967399206557363e-06, + "loss": 0.8073, + "step": 931 + }, + { + "epoch": 0.3440807285257199, + "grad_norm": 0.8933055996894836, + "learning_rate": 4.967320869239748e-06, + "loss": 0.756, + "step": 932 + }, + { + "epoch": 0.3444499138567561, + "grad_norm": 0.9109773635864258, + "learning_rate": 4.967242438534816e-06, + "loss": 0.7481, + "step": 933 + }, + { + "epoch": 0.34481909918779224, + "grad_norm": 0.9119990468025208, + "learning_rate": 4.967163914445537e-06, + "loss": 0.7529, + "step": 934 + }, + { + "epoch": 0.34518828451882844, + "grad_norm": 0.8959357142448425, + "learning_rate": 4.967085296974882e-06, + "loss": 0.7286, + "step": 935 + }, + { + "epoch": 0.34555746984986463, + "grad_norm": 0.9190980195999146, + "learning_rate": 4.967006586125827e-06, + "loss": 0.7311, + "step": 936 + }, + { + "epoch": 0.3459266551809008, + "grad_norm": 0.9075052738189697, + "learning_rate": 4.966927781901351e-06, + "loss": 0.7027, + "step": 937 + }, + { + "epoch": 0.346295840511937, + "grad_norm": 0.8960855603218079, + "learning_rate": 4.9668488843044375e-06, + "loss": 0.7864, + "step": 938 + }, + { + "epoch": 0.34666502584297315, + "grad_norm": 0.9005295038223267, + "learning_rate": 4.9667698933380724e-06, + "loss": 0.7036, + "step": 939 + }, + { + "epoch": 0.34703421117400934, + "grad_norm": 0.9100990891456604, + "learning_rate": 4.966690809005246e-06, + "loss": 0.7641, + "step": 940 + }, + { + "epoch": 0.34740339650504554, + "grad_norm": 0.8951278328895569, + "learning_rate": 4.96661163130895e-06, + "loss": 0.7563, + "step": 941 + }, + { + "epoch": 0.34777258183608173, + "grad_norm": 0.942742109298706, + "learning_rate": 4.966532360252182e-06, + "loss": 0.7392, + "step": 942 + }, + { + "epoch": 0.34814176716711787, + "grad_norm": 0.9036753177642822, + "learning_rate": 4.966452995837943e-06, + "loss": 0.7435, + "step": 943 + }, + { + "epoch": 0.34851095249815406, + "grad_norm": 0.9136203527450562, + "learning_rate": 4.966373538069236e-06, + "loss": 0.751, + "step": 944 + }, + { + "epoch": 0.34888013782919025, + "grad_norm": 0.9514570832252502, + "learning_rate": 4.96629398694907e-06, + "loss": 0.7368, + "step": 945 + }, + { + "epoch": 0.34924932316022644, + "grad_norm": 0.870691180229187, + "learning_rate": 4.966214342480455e-06, + "loss": 0.6951, + "step": 946 + }, + { + "epoch": 0.34961850849126264, + "grad_norm": 0.8581681847572327, + "learning_rate": 4.966134604666405e-06, + "loss": 0.6636, + "step": 947 + }, + { + "epoch": 0.3499876938222988, + "grad_norm": 0.912087619304657, + "learning_rate": 4.966054773509938e-06, + "loss": 0.7349, + "step": 948 + }, + { + "epoch": 0.35035687915333497, + "grad_norm": 0.9115909934043884, + "learning_rate": 4.965974849014078e-06, + "loss": 0.7449, + "step": 949 + }, + { + "epoch": 0.35072606448437116, + "grad_norm": 0.8997576832771301, + "learning_rate": 4.965894831181847e-06, + "loss": 0.7326, + "step": 950 + }, + { + "epoch": 0.35109524981540735, + "grad_norm": 0.8838664293289185, + "learning_rate": 4.965814720016274e-06, + "loss": 0.7337, + "step": 951 + }, + { + "epoch": 0.3514644351464435, + "grad_norm": 0.9265721440315247, + "learning_rate": 4.965734515520393e-06, + "loss": 0.733, + "step": 952 + }, + { + "epoch": 0.3518336204774797, + "grad_norm": 0.9667180776596069, + "learning_rate": 4.9656542176972386e-06, + "loss": 0.7494, + "step": 953 + }, + { + "epoch": 0.3522028058085159, + "grad_norm": 0.9301754236221313, + "learning_rate": 4.965573826549851e-06, + "loss": 0.7519, + "step": 954 + }, + { + "epoch": 0.35257199113955207, + "grad_norm": 0.9007225036621094, + "learning_rate": 4.965493342081271e-06, + "loss": 0.7306, + "step": 955 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.89749675989151, + "learning_rate": 4.965412764294547e-06, + "loss": 0.7551, + "step": 956 + }, + { + "epoch": 0.3533103618016244, + "grad_norm": 0.9183578491210938, + "learning_rate": 4.965332093192727e-06, + "loss": 0.7756, + "step": 957 + }, + { + "epoch": 0.3536795471326606, + "grad_norm": 0.9291596412658691, + "learning_rate": 4.9652513287788665e-06, + "loss": 0.721, + "step": 958 + }, + { + "epoch": 0.3540487324636968, + "grad_norm": 0.9414975047111511, + "learning_rate": 4.965170471056021e-06, + "loss": 0.7852, + "step": 959 + }, + { + "epoch": 0.354417917794733, + "grad_norm": 0.9100828766822815, + "learning_rate": 4.965089520027251e-06, + "loss": 0.7166, + "step": 960 + }, + { + "epoch": 0.3547871031257691, + "grad_norm": 0.8750141263008118, + "learning_rate": 4.96500847569562e-06, + "loss": 0.7535, + "step": 961 + }, + { + "epoch": 0.3551562884568053, + "grad_norm": 0.9412403702735901, + "learning_rate": 4.964927338064197e-06, + "loss": 0.7289, + "step": 962 + }, + { + "epoch": 0.3555254737878415, + "grad_norm": 0.9291063547134399, + "learning_rate": 4.964846107136052e-06, + "loss": 0.7615, + "step": 963 + }, + { + "epoch": 0.3558946591188777, + "grad_norm": 0.9080612063407898, + "learning_rate": 4.96476478291426e-06, + "loss": 0.7425, + "step": 964 + }, + { + "epoch": 0.3562638444499139, + "grad_norm": 0.9023758769035339, + "learning_rate": 4.9646833654018974e-06, + "loss": 0.761, + "step": 965 + }, + { + "epoch": 0.35663302978095, + "grad_norm": 0.9533596038818359, + "learning_rate": 4.964601854602049e-06, + "loss": 0.726, + "step": 966 + }, + { + "epoch": 0.3570022151119862, + "grad_norm": 0.9523007273674011, + "learning_rate": 4.964520250517798e-06, + "loss": 0.7443, + "step": 967 + }, + { + "epoch": 0.3573714004430224, + "grad_norm": 0.9014210104942322, + "learning_rate": 4.964438553152233e-06, + "loss": 0.7348, + "step": 968 + }, + { + "epoch": 0.3577405857740586, + "grad_norm": 0.9457989931106567, + "learning_rate": 4.964356762508447e-06, + "loss": 0.7742, + "step": 969 + }, + { + "epoch": 0.35810977110509473, + "grad_norm": 0.8954206109046936, + "learning_rate": 4.964274878589535e-06, + "loss": 0.7083, + "step": 970 + }, + { + "epoch": 0.3584789564361309, + "grad_norm": 0.9402279853820801, + "learning_rate": 4.964192901398595e-06, + "loss": 0.7566, + "step": 971 + }, + { + "epoch": 0.3588481417671671, + "grad_norm": 0.8966061472892761, + "learning_rate": 4.964110830938734e-06, + "loss": 0.7132, + "step": 972 + }, + { + "epoch": 0.3592173270982033, + "grad_norm": 0.8585612773895264, + "learning_rate": 4.964028667213054e-06, + "loss": 0.7365, + "step": 973 + }, + { + "epoch": 0.3595865124292395, + "grad_norm": 0.8799294233322144, + "learning_rate": 4.9639464102246675e-06, + "loss": 0.7253, + "step": 974 + }, + { + "epoch": 0.35995569776027564, + "grad_norm": 0.9214800000190735, + "learning_rate": 4.963864059976686e-06, + "loss": 0.7317, + "step": 975 + }, + { + "epoch": 0.36032488309131183, + "grad_norm": 0.9081399440765381, + "learning_rate": 4.9637816164722285e-06, + "loss": 0.746, + "step": 976 + }, + { + "epoch": 0.360694068422348, + "grad_norm": 0.9382967948913574, + "learning_rate": 4.963699079714415e-06, + "loss": 0.7606, + "step": 977 + }, + { + "epoch": 0.3610632537533842, + "grad_norm": 0.9069362282752991, + "learning_rate": 4.963616449706367e-06, + "loss": 0.7644, + "step": 978 + }, + { + "epoch": 0.36143243908442035, + "grad_norm": 0.8491265773773193, + "learning_rate": 4.963533726451215e-06, + "loss": 0.738, + "step": 979 + }, + { + "epoch": 0.36180162441545655, + "grad_norm": 0.9470365643501282, + "learning_rate": 4.963450909952089e-06, + "loss": 0.7655, + "step": 980 + }, + { + "epoch": 0.36217080974649274, + "grad_norm": 0.8890638947486877, + "learning_rate": 4.963368000212123e-06, + "loss": 0.7089, + "step": 981 + }, + { + "epoch": 0.36253999507752893, + "grad_norm": 0.9325974583625793, + "learning_rate": 4.963284997234456e-06, + "loss": 0.775, + "step": 982 + }, + { + "epoch": 0.3629091804085651, + "grad_norm": 0.874653160572052, + "learning_rate": 4.96320190102223e-06, + "loss": 0.7289, + "step": 983 + }, + { + "epoch": 0.36327836573960126, + "grad_norm": 0.9684560894966125, + "learning_rate": 4.9631187115785885e-06, + "loss": 0.7216, + "step": 984 + }, + { + "epoch": 0.36364755107063745, + "grad_norm": 0.9118450284004211, + "learning_rate": 4.963035428906681e-06, + "loss": 0.7647, + "step": 985 + }, + { + "epoch": 0.36401673640167365, + "grad_norm": 0.9047130346298218, + "learning_rate": 4.96295205300966e-06, + "loss": 0.748, + "step": 986 + }, + { + "epoch": 0.36438592173270984, + "grad_norm": 0.9111741185188293, + "learning_rate": 4.962868583890682e-06, + "loss": 0.7079, + "step": 987 + }, + { + "epoch": 0.364755107063746, + "grad_norm": 0.9440281391143799, + "learning_rate": 4.962785021552904e-06, + "loss": 0.7544, + "step": 988 + }, + { + "epoch": 0.36512429239478217, + "grad_norm": 0.9372045397758484, + "learning_rate": 4.962701365999491e-06, + "loss": 0.7232, + "step": 989 + }, + { + "epoch": 0.36549347772581836, + "grad_norm": 0.9209040999412537, + "learning_rate": 4.962617617233608e-06, + "loss": 0.7434, + "step": 990 + }, + { + "epoch": 0.36586266305685455, + "grad_norm": 0.9633092284202576, + "learning_rate": 4.962533775258426e-06, + "loss": 0.7531, + "step": 991 + }, + { + "epoch": 0.36623184838789075, + "grad_norm": 0.9297153353691101, + "learning_rate": 4.962449840077118e-06, + "loss": 0.7644, + "step": 992 + }, + { + "epoch": 0.3666010337189269, + "grad_norm": 0.9207053184509277, + "learning_rate": 4.96236581169286e-06, + "loss": 0.7789, + "step": 993 + }, + { + "epoch": 0.3669702190499631, + "grad_norm": 0.9333449602127075, + "learning_rate": 4.962281690108834e-06, + "loss": 0.8054, + "step": 994 + }, + { + "epoch": 0.36733940438099927, + "grad_norm": 0.9394077658653259, + "learning_rate": 4.962197475328222e-06, + "loss": 0.7472, + "step": 995 + }, + { + "epoch": 0.36770858971203546, + "grad_norm": 0.980050802230835, + "learning_rate": 4.962113167354213e-06, + "loss": 0.7505, + "step": 996 + }, + { + "epoch": 0.3680777750430716, + "grad_norm": 0.9034879803657532, + "learning_rate": 4.962028766189999e-06, + "loss": 0.7232, + "step": 997 + }, + { + "epoch": 0.3684469603741078, + "grad_norm": 0.9549526572227478, + "learning_rate": 4.961944271838772e-06, + "loss": 0.7566, + "step": 998 + }, + { + "epoch": 0.368816145705144, + "grad_norm": 0.9430050253868103, + "learning_rate": 4.961859684303731e-06, + "loss": 0.7485, + "step": 999 + }, + { + "epoch": 0.3691853310361802, + "grad_norm": 1.0592334270477295, + "learning_rate": 4.961775003588079e-06, + "loss": 0.7219, + "step": 1000 + }, + { + "epoch": 0.36955451636721637, + "grad_norm": 0.9190243482589722, + "learning_rate": 4.96169022969502e-06, + "loss": 0.7679, + "step": 1001 + }, + { + "epoch": 0.3699237016982525, + "grad_norm": 0.9574081301689148, + "learning_rate": 4.961605362627761e-06, + "loss": 0.7678, + "step": 1002 + }, + { + "epoch": 0.3702928870292887, + "grad_norm": 0.9283050298690796, + "learning_rate": 4.961520402389517e-06, + "loss": 0.7584, + "step": 1003 + }, + { + "epoch": 0.3706620723603249, + "grad_norm": 0.9419953227043152, + "learning_rate": 4.961435348983503e-06, + "loss": 0.7955, + "step": 1004 + }, + { + "epoch": 0.3710312576913611, + "grad_norm": 0.9116794466972351, + "learning_rate": 4.961350202412938e-06, + "loss": 0.7739, + "step": 1005 + }, + { + "epoch": 0.3714004430223972, + "grad_norm": 0.8843052387237549, + "learning_rate": 4.961264962681044e-06, + "loss": 0.6978, + "step": 1006 + }, + { + "epoch": 0.3717696283534334, + "grad_norm": 0.9120619297027588, + "learning_rate": 4.961179629791049e-06, + "loss": 0.7662, + "step": 1007 + }, + { + "epoch": 0.3721388136844696, + "grad_norm": 0.9376091361045837, + "learning_rate": 4.961094203746181e-06, + "loss": 0.7636, + "step": 1008 + }, + { + "epoch": 0.3725079990155058, + "grad_norm": 0.9145896434783936, + "learning_rate": 4.961008684549674e-06, + "loss": 0.7355, + "step": 1009 + }, + { + "epoch": 0.372877184346542, + "grad_norm": 0.8806540966033936, + "learning_rate": 4.960923072204765e-06, + "loss": 0.7709, + "step": 1010 + }, + { + "epoch": 0.3732463696775781, + "grad_norm": 0.8880794048309326, + "learning_rate": 4.9608373667146945e-06, + "loss": 0.7272, + "step": 1011 + }, + { + "epoch": 0.3736155550086143, + "grad_norm": 0.9063923358917236, + "learning_rate": 4.9607515680827065e-06, + "loss": 0.7184, + "step": 1012 + }, + { + "epoch": 0.3739847403396505, + "grad_norm": 0.8815094232559204, + "learning_rate": 4.960665676312047e-06, + "loss": 0.753, + "step": 1013 + }, + { + "epoch": 0.3743539256706867, + "grad_norm": 0.899018406867981, + "learning_rate": 4.96057969140597e-06, + "loss": 0.7388, + "step": 1014 + }, + { + "epoch": 0.37472311100172284, + "grad_norm": 0.9038127064704895, + "learning_rate": 4.960493613367728e-06, + "loss": 0.7131, + "step": 1015 + }, + { + "epoch": 0.37509229633275903, + "grad_norm": 0.890527069568634, + "learning_rate": 4.960407442200579e-06, + "loss": 0.7646, + "step": 1016 + }, + { + "epoch": 0.3754614816637952, + "grad_norm": 0.9003925919532776, + "learning_rate": 4.9603211779077845e-06, + "loss": 0.7285, + "step": 1017 + }, + { + "epoch": 0.3758306669948314, + "grad_norm": 0.9128808975219727, + "learning_rate": 4.96023482049261e-06, + "loss": 0.7608, + "step": 1018 + }, + { + "epoch": 0.3761998523258676, + "grad_norm": 0.9050713181495667, + "learning_rate": 4.960148369958324e-06, + "loss": 0.7375, + "step": 1019 + }, + { + "epoch": 0.37656903765690375, + "grad_norm": 0.9218745827674866, + "learning_rate": 4.960061826308199e-06, + "loss": 0.7222, + "step": 1020 + }, + { + "epoch": 0.37693822298793994, + "grad_norm": 0.9460575580596924, + "learning_rate": 4.95997518954551e-06, + "loss": 0.7733, + "step": 1021 + }, + { + "epoch": 0.37730740831897613, + "grad_norm": 0.9175562262535095, + "learning_rate": 4.959888459673536e-06, + "loss": 0.7418, + "step": 1022 + }, + { + "epoch": 0.3776765936500123, + "grad_norm": 0.9456244707107544, + "learning_rate": 4.959801636695561e-06, + "loss": 0.7552, + "step": 1023 + }, + { + "epoch": 0.37804577898104846, + "grad_norm": 0.8985305428504944, + "learning_rate": 4.959714720614871e-06, + "loss": 0.7366, + "step": 1024 + }, + { + "epoch": 0.37841496431208466, + "grad_norm": 0.9268773794174194, + "learning_rate": 4.959627711434753e-06, + "loss": 0.7408, + "step": 1025 + }, + { + "epoch": 0.37878414964312085, + "grad_norm": 0.9267814755439758, + "learning_rate": 4.959540609158504e-06, + "loss": 0.7589, + "step": 1026 + }, + { + "epoch": 0.37915333497415704, + "grad_norm": 0.905430018901825, + "learning_rate": 4.959453413789419e-06, + "loss": 0.7404, + "step": 1027 + }, + { + "epoch": 0.37952252030519323, + "grad_norm": 0.9021572470664978, + "learning_rate": 4.959366125330798e-06, + "loss": 0.7399, + "step": 1028 + }, + { + "epoch": 0.37989170563622937, + "grad_norm": 0.9395810961723328, + "learning_rate": 4.9592787437859455e-06, + "loss": 0.7259, + "step": 1029 + }, + { + "epoch": 0.38026089096726556, + "grad_norm": 0.8881012201309204, + "learning_rate": 4.959191269158169e-06, + "loss": 0.7365, + "step": 1030 + }, + { + "epoch": 0.38063007629830176, + "grad_norm": 0.9610297679901123, + "learning_rate": 4.959103701450779e-06, + "loss": 0.7141, + "step": 1031 + }, + { + "epoch": 0.38099926162933795, + "grad_norm": 0.8691598176956177, + "learning_rate": 4.959016040667089e-06, + "loss": 0.725, + "step": 1032 + }, + { + "epoch": 0.3813684469603741, + "grad_norm": 0.912972092628479, + "learning_rate": 4.9589282868104195e-06, + "loss": 0.6629, + "step": 1033 + }, + { + "epoch": 0.3817376322914103, + "grad_norm": 0.9238312840461731, + "learning_rate": 4.95884043988409e-06, + "loss": 0.7799, + "step": 1034 + }, + { + "epoch": 0.38210681762244647, + "grad_norm": 0.8956131935119629, + "learning_rate": 4.9587524998914255e-06, + "loss": 0.759, + "step": 1035 + }, + { + "epoch": 0.38247600295348266, + "grad_norm": 0.8826711177825928, + "learning_rate": 4.958664466835756e-06, + "loss": 0.7486, + "step": 1036 + }, + { + "epoch": 0.38284518828451886, + "grad_norm": 0.9134225845336914, + "learning_rate": 4.95857634072041e-06, + "loss": 0.7608, + "step": 1037 + }, + { + "epoch": 0.383214373615555, + "grad_norm": 0.9029486179351807, + "learning_rate": 4.958488121548727e-06, + "loss": 0.7254, + "step": 1038 + }, + { + "epoch": 0.3835835589465912, + "grad_norm": 0.8955227732658386, + "learning_rate": 4.958399809324045e-06, + "loss": 0.7928, + "step": 1039 + }, + { + "epoch": 0.3839527442776274, + "grad_norm": 0.8920780420303345, + "learning_rate": 4.958311404049705e-06, + "loss": 0.7654, + "step": 1040 + }, + { + "epoch": 0.38432192960866357, + "grad_norm": 0.9059119820594788, + "learning_rate": 4.958222905729055e-06, + "loss": 0.7263, + "step": 1041 + }, + { + "epoch": 0.3846911149396997, + "grad_norm": 0.8846084475517273, + "learning_rate": 4.958134314365443e-06, + "loss": 0.7119, + "step": 1042 + }, + { + "epoch": 0.3850603002707359, + "grad_norm": 0.8834053874015808, + "learning_rate": 4.9580456299622235e-06, + "loss": 0.7113, + "step": 1043 + }, + { + "epoch": 0.3854294856017721, + "grad_norm": 0.9049264192581177, + "learning_rate": 4.957956852522753e-06, + "loss": 0.7233, + "step": 1044 + }, + { + "epoch": 0.3857986709328083, + "grad_norm": 0.8891218900680542, + "learning_rate": 4.9578679820503905e-06, + "loss": 0.6903, + "step": 1045 + }, + { + "epoch": 0.3861678562638444, + "grad_norm": 0.8697208166122437, + "learning_rate": 4.957779018548501e-06, + "loss": 0.748, + "step": 1046 + }, + { + "epoch": 0.3865370415948806, + "grad_norm": 0.9173257946968079, + "learning_rate": 4.957689962020452e-06, + "loss": 0.762, + "step": 1047 + }, + { + "epoch": 0.3869062269259168, + "grad_norm": 0.9417243003845215, + "learning_rate": 4.957600812469613e-06, + "loss": 0.7513, + "step": 1048 + }, + { + "epoch": 0.387275412256953, + "grad_norm": 0.9135996103286743, + "learning_rate": 4.95751156989936e-06, + "loss": 0.7303, + "step": 1049 + }, + { + "epoch": 0.3876445975879892, + "grad_norm": 0.8908846378326416, + "learning_rate": 4.957422234313068e-06, + "loss": 0.7606, + "step": 1050 + }, + { + "epoch": 0.38801378291902533, + "grad_norm": 0.8868354558944702, + "learning_rate": 4.95733280571412e-06, + "loss": 0.7219, + "step": 1051 + }, + { + "epoch": 0.3883829682500615, + "grad_norm": 0.9057561755180359, + "learning_rate": 4.957243284105902e-06, + "loss": 0.7392, + "step": 1052 + }, + { + "epoch": 0.3887521535810977, + "grad_norm": 0.9165422916412354, + "learning_rate": 4.9571536694918e-06, + "loss": 0.7682, + "step": 1053 + }, + { + "epoch": 0.3891213389121339, + "grad_norm": 0.9119623899459839, + "learning_rate": 4.957063961875208e-06, + "loss": 0.714, + "step": 1054 + }, + { + "epoch": 0.38949052424317004, + "grad_norm": 0.9237239956855774, + "learning_rate": 4.95697416125952e-06, + "loss": 0.7394, + "step": 1055 + }, + { + "epoch": 0.38985970957420624, + "grad_norm": 0.8836085796356201, + "learning_rate": 4.956884267648136e-06, + "loss": 0.7101, + "step": 1056 + }, + { + "epoch": 0.39022889490524243, + "grad_norm": 0.8749224543571472, + "learning_rate": 4.956794281044458e-06, + "loss": 0.7181, + "step": 1057 + }, + { + "epoch": 0.3905980802362786, + "grad_norm": 0.9028134942054749, + "learning_rate": 4.956704201451891e-06, + "loss": 0.7268, + "step": 1058 + }, + { + "epoch": 0.3909672655673148, + "grad_norm": 0.8953654170036316, + "learning_rate": 4.956614028873846e-06, + "loss": 0.7691, + "step": 1059 + }, + { + "epoch": 0.39133645089835095, + "grad_norm": 0.881486713886261, + "learning_rate": 4.956523763313736e-06, + "loss": 0.7499, + "step": 1060 + }, + { + "epoch": 0.39170563622938714, + "grad_norm": 0.8811509609222412, + "learning_rate": 4.956433404774975e-06, + "loss": 0.718, + "step": 1061 + }, + { + "epoch": 0.39207482156042334, + "grad_norm": 0.9057355523109436, + "learning_rate": 4.956342953260986e-06, + "loss": 0.7533, + "step": 1062 + }, + { + "epoch": 0.39244400689145953, + "grad_norm": 0.9343195557594299, + "learning_rate": 4.956252408775191e-06, + "loss": 0.7808, + "step": 1063 + }, + { + "epoch": 0.39281319222249567, + "grad_norm": 0.8957801461219788, + "learning_rate": 4.9561617713210174e-06, + "loss": 0.7304, + "step": 1064 + }, + { + "epoch": 0.39318237755353186, + "grad_norm": 0.8987041115760803, + "learning_rate": 4.956071040901897e-06, + "loss": 0.745, + "step": 1065 + }, + { + "epoch": 0.39355156288456805, + "grad_norm": 0.9891381859779358, + "learning_rate": 4.955980217521263e-06, + "loss": 0.761, + "step": 1066 + }, + { + "epoch": 0.39392074821560424, + "grad_norm": 0.9239757061004639, + "learning_rate": 4.955889301182551e-06, + "loss": 0.7811, + "step": 1067 + }, + { + "epoch": 0.39428993354664044, + "grad_norm": 0.9178396463394165, + "learning_rate": 4.955798291889205e-06, + "loss": 0.7447, + "step": 1068 + }, + { + "epoch": 0.3946591188776766, + "grad_norm": 0.927836537361145, + "learning_rate": 4.955707189644669e-06, + "loss": 0.7527, + "step": 1069 + }, + { + "epoch": 0.39502830420871277, + "grad_norm": 0.9121710658073425, + "learning_rate": 4.955615994452391e-06, + "loss": 0.7283, + "step": 1070 + }, + { + "epoch": 0.39539748953974896, + "grad_norm": 0.8991080522537231, + "learning_rate": 4.955524706315822e-06, + "loss": 0.7521, + "step": 1071 + }, + { + "epoch": 0.39576667487078515, + "grad_norm": 0.9071163535118103, + "learning_rate": 4.955433325238418e-06, + "loss": 0.7227, + "step": 1072 + }, + { + "epoch": 0.3961358602018213, + "grad_norm": 0.9359886646270752, + "learning_rate": 4.955341851223639e-06, + "loss": 0.7888, + "step": 1073 + }, + { + "epoch": 0.3965050455328575, + "grad_norm": 0.9484068155288696, + "learning_rate": 4.955250284274944e-06, + "loss": 0.7609, + "step": 1074 + }, + { + "epoch": 0.3968742308638937, + "grad_norm": 0.9310367107391357, + "learning_rate": 4.9551586243958e-06, + "loss": 0.7698, + "step": 1075 + }, + { + "epoch": 0.39724341619492987, + "grad_norm": 0.9183504581451416, + "learning_rate": 4.955066871589679e-06, + "loss": 0.7442, + "step": 1076 + }, + { + "epoch": 0.39761260152596606, + "grad_norm": 0.9239982962608337, + "learning_rate": 4.954975025860051e-06, + "loss": 0.7485, + "step": 1077 + }, + { + "epoch": 0.3979817868570022, + "grad_norm": 0.9956908226013184, + "learning_rate": 4.954883087210393e-06, + "loss": 0.7817, + "step": 1078 + }, + { + "epoch": 0.3983509721880384, + "grad_norm": 0.897830069065094, + "learning_rate": 4.9547910556441845e-06, + "loss": 0.7241, + "step": 1079 + }, + { + "epoch": 0.3987201575190746, + "grad_norm": 0.8917036652565002, + "learning_rate": 4.95469893116491e-06, + "loss": 0.7047, + "step": 1080 + }, + { + "epoch": 0.3990893428501108, + "grad_norm": 0.8943704962730408, + "learning_rate": 4.954606713776056e-06, + "loss": 0.7071, + "step": 1081 + }, + { + "epoch": 0.3994585281811469, + "grad_norm": 0.8890754580497742, + "learning_rate": 4.954514403481112e-06, + "loss": 0.7295, + "step": 1082 + }, + { + "epoch": 0.3998277135121831, + "grad_norm": 0.9230964779853821, + "learning_rate": 4.954422000283572e-06, + "loss": 0.7491, + "step": 1083 + }, + { + "epoch": 0.4001968988432193, + "grad_norm": 0.9171218276023865, + "learning_rate": 4.954329504186935e-06, + "loss": 0.7463, + "step": 1084 + }, + { + "epoch": 0.4005660841742555, + "grad_norm": 0.8871411085128784, + "learning_rate": 4.954236915194699e-06, + "loss": 0.7312, + "step": 1085 + }, + { + "epoch": 0.4009352695052917, + "grad_norm": 0.9313485026359558, + "learning_rate": 4.954144233310372e-06, + "loss": 0.7129, + "step": 1086 + }, + { + "epoch": 0.4013044548363278, + "grad_norm": 0.9138079881668091, + "learning_rate": 4.95405145853746e-06, + "loss": 0.746, + "step": 1087 + }, + { + "epoch": 0.401673640167364, + "grad_norm": 0.9450967907905579, + "learning_rate": 4.9539585908794746e-06, + "loss": 0.7811, + "step": 1088 + }, + { + "epoch": 0.4020428254984002, + "grad_norm": 0.9284427762031555, + "learning_rate": 4.9538656303399314e-06, + "loss": 0.74, + "step": 1089 + }, + { + "epoch": 0.4024120108294364, + "grad_norm": 0.9271199107170105, + "learning_rate": 4.953772576922348e-06, + "loss": 0.7622, + "step": 1090 + }, + { + "epoch": 0.40278119616047253, + "grad_norm": 0.8856538534164429, + "learning_rate": 4.953679430630247e-06, + "loss": 0.6994, + "step": 1091 + }, + { + "epoch": 0.4031503814915087, + "grad_norm": 0.9314479231834412, + "learning_rate": 4.953586191467155e-06, + "loss": 0.7583, + "step": 1092 + }, + { + "epoch": 0.4035195668225449, + "grad_norm": 0.9417778253555298, + "learning_rate": 4.953492859436599e-06, + "loss": 0.7601, + "step": 1093 + }, + { + "epoch": 0.4038887521535811, + "grad_norm": 0.8887828588485718, + "learning_rate": 4.953399434542112e-06, + "loss": 0.744, + "step": 1094 + }, + { + "epoch": 0.4042579374846173, + "grad_norm": 0.926156759262085, + "learning_rate": 4.953305916787232e-06, + "loss": 0.7113, + "step": 1095 + }, + { + "epoch": 0.40462712281565344, + "grad_norm": 0.9664729833602905, + "learning_rate": 4.9532123061754966e-06, + "loss": 0.733, + "step": 1096 + }, + { + "epoch": 0.40499630814668963, + "grad_norm": 0.9112409353256226, + "learning_rate": 4.953118602710449e-06, + "loss": 0.764, + "step": 1097 + }, + { + "epoch": 0.4053654934777258, + "grad_norm": 0.9250743389129639, + "learning_rate": 4.9530248063956375e-06, + "loss": 0.7536, + "step": 1098 + }, + { + "epoch": 0.405734678808762, + "grad_norm": 0.8889137506484985, + "learning_rate": 4.952930917234612e-06, + "loss": 0.7208, + "step": 1099 + }, + { + "epoch": 0.40610386413979815, + "grad_norm": 0.9497808814048767, + "learning_rate": 4.952836935230924e-06, + "loss": 0.7263, + "step": 1100 + }, + { + "epoch": 0.40647304947083435, + "grad_norm": 0.9288194179534912, + "learning_rate": 4.952742860388133e-06, + "loss": 0.7321, + "step": 1101 + }, + { + "epoch": 0.40684223480187054, + "grad_norm": 0.9409608244895935, + "learning_rate": 4.952648692709798e-06, + "loss": 0.7375, + "step": 1102 + }, + { + "epoch": 0.40721142013290673, + "grad_norm": 0.8995351195335388, + "learning_rate": 4.952554432199485e-06, + "loss": 0.7626, + "step": 1103 + }, + { + "epoch": 0.4075806054639429, + "grad_norm": 0.9207445979118347, + "learning_rate": 4.95246007886076e-06, + "loss": 0.7324, + "step": 1104 + }, + { + "epoch": 0.40794979079497906, + "grad_norm": 0.9042755365371704, + "learning_rate": 4.9523656326971954e-06, + "loss": 0.7281, + "step": 1105 + }, + { + "epoch": 0.40831897612601525, + "grad_norm": 0.9243267178535461, + "learning_rate": 4.952271093712366e-06, + "loss": 0.7536, + "step": 1106 + }, + { + "epoch": 0.40868816145705145, + "grad_norm": 0.9243746399879456, + "learning_rate": 4.952176461909849e-06, + "loss": 0.7514, + "step": 1107 + }, + { + "epoch": 0.40905734678808764, + "grad_norm": 0.9338531494140625, + "learning_rate": 4.952081737293227e-06, + "loss": 0.7443, + "step": 1108 + }, + { + "epoch": 0.4094265321191238, + "grad_norm": 0.9238013625144958, + "learning_rate": 4.951986919866085e-06, + "loss": 0.6872, + "step": 1109 + }, + { + "epoch": 0.40979571745015997, + "grad_norm": 0.9439871907234192, + "learning_rate": 4.951892009632012e-06, + "loss": 0.7577, + "step": 1110 + }, + { + "epoch": 0.41016490278119616, + "grad_norm": 0.9026377201080322, + "learning_rate": 4.951797006594601e-06, + "loss": 0.7025, + "step": 1111 + }, + { + "epoch": 0.41053408811223235, + "grad_norm": 0.9088364839553833, + "learning_rate": 4.951701910757446e-06, + "loss": 0.7286, + "step": 1112 + }, + { + "epoch": 0.41090327344326855, + "grad_norm": 0.9279249906539917, + "learning_rate": 4.9516067221241485e-06, + "loss": 0.7533, + "step": 1113 + }, + { + "epoch": 0.4112724587743047, + "grad_norm": 0.8800785541534424, + "learning_rate": 4.951511440698309e-06, + "loss": 0.7352, + "step": 1114 + }, + { + "epoch": 0.4116416441053409, + "grad_norm": 0.8675339818000793, + "learning_rate": 4.9514160664835366e-06, + "loss": 0.7565, + "step": 1115 + }, + { + "epoch": 0.41201082943637707, + "grad_norm": 0.9367777109146118, + "learning_rate": 4.951320599483439e-06, + "loss": 0.7501, + "step": 1116 + }, + { + "epoch": 0.41238001476741326, + "grad_norm": 0.9239450693130493, + "learning_rate": 4.9512250397016304e-06, + "loss": 0.7774, + "step": 1117 + }, + { + "epoch": 0.4127492000984494, + "grad_norm": 0.8786678314208984, + "learning_rate": 4.951129387141728e-06, + "loss": 0.7385, + "step": 1118 + }, + { + "epoch": 0.4131183854294856, + "grad_norm": 0.9262030720710754, + "learning_rate": 4.951033641807351e-06, + "loss": 0.7653, + "step": 1119 + }, + { + "epoch": 0.4134875707605218, + "grad_norm": 0.901329755783081, + "learning_rate": 4.950937803702125e-06, + "loss": 0.7375, + "step": 1120 + }, + { + "epoch": 0.413856756091558, + "grad_norm": 0.9039179086685181, + "learning_rate": 4.950841872829676e-06, + "loss": 0.7644, + "step": 1121 + }, + { + "epoch": 0.41422594142259417, + "grad_norm": 0.9563358426094055, + "learning_rate": 4.9507458491936365e-06, + "loss": 0.7476, + "step": 1122 + }, + { + "epoch": 0.4145951267536303, + "grad_norm": 0.9279500842094421, + "learning_rate": 4.950649732797639e-06, + "loss": 0.723, + "step": 1123 + }, + { + "epoch": 0.4149643120846665, + "grad_norm": 0.9395243525505066, + "learning_rate": 4.950553523645324e-06, + "loss": 0.7341, + "step": 1124 + }, + { + "epoch": 0.4153334974157027, + "grad_norm": 0.9069976210594177, + "learning_rate": 4.9504572217403305e-06, + "loss": 0.7458, + "step": 1125 + }, + { + "epoch": 0.4157026827467389, + "grad_norm": 0.9429416656494141, + "learning_rate": 4.9503608270863046e-06, + "loss": 0.7534, + "step": 1126 + }, + { + "epoch": 0.416071868077775, + "grad_norm": 0.9167525172233582, + "learning_rate": 4.950264339686895e-06, + "loss": 0.7223, + "step": 1127 + }, + { + "epoch": 0.4164410534088112, + "grad_norm": 0.8981575965881348, + "learning_rate": 4.950167759545753e-06, + "loss": 0.7562, + "step": 1128 + }, + { + "epoch": 0.4168102387398474, + "grad_norm": 0.8772927522659302, + "learning_rate": 4.950071086666535e-06, + "loss": 0.7528, + "step": 1129 + }, + { + "epoch": 0.4171794240708836, + "grad_norm": 0.8946474194526672, + "learning_rate": 4.949974321052899e-06, + "loss": 0.7232, + "step": 1130 + }, + { + "epoch": 0.4175486094019198, + "grad_norm": 0.9215155839920044, + "learning_rate": 4.94987746270851e-06, + "loss": 0.7705, + "step": 1131 + }, + { + "epoch": 0.4179177947329559, + "grad_norm": 0.8860598802566528, + "learning_rate": 4.94978051163703e-06, + "loss": 0.7468, + "step": 1132 + }, + { + "epoch": 0.4182869800639921, + "grad_norm": 0.9385093450546265, + "learning_rate": 4.9496834678421325e-06, + "loss": 0.7881, + "step": 1133 + }, + { + "epoch": 0.4186561653950283, + "grad_norm": 0.9394497275352478, + "learning_rate": 4.949586331327488e-06, + "loss": 0.7543, + "step": 1134 + }, + { + "epoch": 0.4190253507260645, + "grad_norm": 0.9088026881217957, + "learning_rate": 4.949489102096774e-06, + "loss": 0.7598, + "step": 1135 + }, + { + "epoch": 0.41939453605710064, + "grad_norm": 0.9452118873596191, + "learning_rate": 4.94939178015367e-06, + "loss": 0.7462, + "step": 1136 + }, + { + "epoch": 0.41976372138813683, + "grad_norm": 0.9107792377471924, + "learning_rate": 4.949294365501862e-06, + "loss": 0.7423, + "step": 1137 + }, + { + "epoch": 0.420132906719173, + "grad_norm": 0.8913663625717163, + "learning_rate": 4.9491968581450334e-06, + "loss": 0.7334, + "step": 1138 + }, + { + "epoch": 0.4205020920502092, + "grad_norm": 0.902692973613739, + "learning_rate": 4.949099258086878e-06, + "loss": 0.701, + "step": 1139 + }, + { + "epoch": 0.4208712773812454, + "grad_norm": 0.8897360563278198, + "learning_rate": 4.949001565331087e-06, + "loss": 0.7425, + "step": 1140 + }, + { + "epoch": 0.42124046271228155, + "grad_norm": 0.9004592299461365, + "learning_rate": 4.948903779881361e-06, + "loss": 0.7052, + "step": 1141 + }, + { + "epoch": 0.42160964804331774, + "grad_norm": 0.8605636954307556, + "learning_rate": 4.9488059017413995e-06, + "loss": 0.7199, + "step": 1142 + }, + { + "epoch": 0.42197883337435393, + "grad_norm": 0.9023451805114746, + "learning_rate": 4.948707930914908e-06, + "loss": 0.7718, + "step": 1143 + }, + { + "epoch": 0.4223480187053901, + "grad_norm": 0.9158706068992615, + "learning_rate": 4.948609867405594e-06, + "loss": 0.759, + "step": 1144 + }, + { + "epoch": 0.42271720403642626, + "grad_norm": 0.914189338684082, + "learning_rate": 4.948511711217168e-06, + "loss": 0.7292, + "step": 1145 + }, + { + "epoch": 0.42308638936746246, + "grad_norm": 0.9431989789009094, + "learning_rate": 4.948413462353347e-06, + "loss": 0.7385, + "step": 1146 + }, + { + "epoch": 0.42345557469849865, + "grad_norm": 0.9301594495773315, + "learning_rate": 4.9483151208178505e-06, + "loss": 0.7613, + "step": 1147 + }, + { + "epoch": 0.42382476002953484, + "grad_norm": 0.9736144542694092, + "learning_rate": 4.948216686614398e-06, + "loss": 0.7694, + "step": 1148 + }, + { + "epoch": 0.42419394536057103, + "grad_norm": 0.9035144448280334, + "learning_rate": 4.948118159746718e-06, + "loss": 0.7107, + "step": 1149 + }, + { + "epoch": 0.42456313069160717, + "grad_norm": 0.9065275192260742, + "learning_rate": 4.948019540218536e-06, + "loss": 0.7772, + "step": 1150 + }, + { + "epoch": 0.42493231602264336, + "grad_norm": 0.920475959777832, + "learning_rate": 4.9479208280335885e-06, + "loss": 0.7342, + "step": 1151 + }, + { + "epoch": 0.42530150135367956, + "grad_norm": 0.9290496706962585, + "learning_rate": 4.947822023195611e-06, + "loss": 0.7331, + "step": 1152 + }, + { + "epoch": 0.42567068668471575, + "grad_norm": 0.8922300934791565, + "learning_rate": 4.9477231257083415e-06, + "loss": 0.7655, + "step": 1153 + }, + { + "epoch": 0.4260398720157519, + "grad_norm": 0.9539517164230347, + "learning_rate": 4.947624135575524e-06, + "loss": 0.7567, + "step": 1154 + }, + { + "epoch": 0.4264090573467881, + "grad_norm": 0.8572643995285034, + "learning_rate": 4.9475250528009055e-06, + "loss": 0.6853, + "step": 1155 + }, + { + "epoch": 0.42677824267782427, + "grad_norm": 1.0646028518676758, + "learning_rate": 4.947425877388237e-06, + "loss": 0.7308, + "step": 1156 + }, + { + "epoch": 0.42714742800886046, + "grad_norm": 0.9418565630912781, + "learning_rate": 4.947326609341271e-06, + "loss": 0.7319, + "step": 1157 + }, + { + "epoch": 0.4275166133398966, + "grad_norm": 0.9165734052658081, + "learning_rate": 4.947227248663764e-06, + "loss": 0.7168, + "step": 1158 + }, + { + "epoch": 0.4278857986709328, + "grad_norm": 0.9139310717582703, + "learning_rate": 4.94712779535948e-06, + "loss": 0.7387, + "step": 1159 + }, + { + "epoch": 0.428254984001969, + "grad_norm": 0.8762199282646179, + "learning_rate": 4.94702824943218e-06, + "loss": 0.6796, + "step": 1160 + }, + { + "epoch": 0.4286241693330052, + "grad_norm": 0.9274040460586548, + "learning_rate": 4.946928610885633e-06, + "loss": 0.7424, + "step": 1161 + }, + { + "epoch": 0.42899335466404137, + "grad_norm": 0.9182401299476624, + "learning_rate": 4.946828879723611e-06, + "loss": 0.7242, + "step": 1162 + }, + { + "epoch": 0.4293625399950775, + "grad_norm": 0.9233170747756958, + "learning_rate": 4.946729055949888e-06, + "loss": 0.7557, + "step": 1163 + }, + { + "epoch": 0.4297317253261137, + "grad_norm": 0.9127527475357056, + "learning_rate": 4.946629139568242e-06, + "loss": 0.754, + "step": 1164 + }, + { + "epoch": 0.4301009106571499, + "grad_norm": 0.9096380472183228, + "learning_rate": 4.946529130582456e-06, + "loss": 0.7747, + "step": 1165 + }, + { + "epoch": 0.4304700959881861, + "grad_norm": 0.927699089050293, + "learning_rate": 4.946429028996314e-06, + "loss": 0.7741, + "step": 1166 + }, + { + "epoch": 0.4308392813192222, + "grad_norm": 0.8996379971504211, + "learning_rate": 4.946328834813605e-06, + "loss": 0.7582, + "step": 1167 + }, + { + "epoch": 0.4312084666502584, + "grad_norm": 0.9378359317779541, + "learning_rate": 4.946228548038122e-06, + "loss": 0.7873, + "step": 1168 + }, + { + "epoch": 0.4315776519812946, + "grad_norm": 0.85906583070755, + "learning_rate": 4.946128168673662e-06, + "loss": 0.6732, + "step": 1169 + }, + { + "epoch": 0.4319468373123308, + "grad_norm": 0.8577884435653687, + "learning_rate": 4.94602769672402e-06, + "loss": 0.705, + "step": 1170 + }, + { + "epoch": 0.432316022643367, + "grad_norm": 0.8988800048828125, + "learning_rate": 4.945927132193003e-06, + "loss": 0.7255, + "step": 1171 + }, + { + "epoch": 0.43268520797440313, + "grad_norm": 0.8887507319450378, + "learning_rate": 4.945826475084417e-06, + "loss": 0.7122, + "step": 1172 + }, + { + "epoch": 0.4330543933054393, + "grad_norm": 0.9307096004486084, + "learning_rate": 4.9457257254020696e-06, + "loss": 0.7394, + "step": 1173 + }, + { + "epoch": 0.4334235786364755, + "grad_norm": 0.9346253871917725, + "learning_rate": 4.945624883149776e-06, + "loss": 0.7617, + "step": 1174 + }, + { + "epoch": 0.4337927639675117, + "grad_norm": 0.935279905796051, + "learning_rate": 4.945523948331352e-06, + "loss": 0.72, + "step": 1175 + }, + { + "epoch": 0.43416194929854784, + "grad_norm": 0.9695756435394287, + "learning_rate": 4.9454229209506186e-06, + "loss": 0.7499, + "step": 1176 + }, + { + "epoch": 0.43453113462958404, + "grad_norm": 0.8869019150733948, + "learning_rate": 4.9453218010114e-06, + "loss": 0.6932, + "step": 1177 + }, + { + "epoch": 0.43490031996062023, + "grad_norm": 0.9020052552223206, + "learning_rate": 4.945220588517522e-06, + "loss": 0.7313, + "step": 1178 + }, + { + "epoch": 0.4352695052916564, + "grad_norm": 0.9483025670051575, + "learning_rate": 4.945119283472816e-06, + "loss": 0.7311, + "step": 1179 + }, + { + "epoch": 0.4356386906226926, + "grad_norm": 0.8632071614265442, + "learning_rate": 4.945017885881118e-06, + "loss": 0.6701, + "step": 1180 + }, + { + "epoch": 0.43600787595372875, + "grad_norm": 0.8918522000312805, + "learning_rate": 4.944916395746264e-06, + "loss": 0.6994, + "step": 1181 + }, + { + "epoch": 0.43637706128476494, + "grad_norm": 0.9055469036102295, + "learning_rate": 4.944814813072097e-06, + "loss": 0.7532, + "step": 1182 + }, + { + "epoch": 0.43674624661580114, + "grad_norm": 0.9149122834205627, + "learning_rate": 4.94471313786246e-06, + "loss": 0.7713, + "step": 1183 + }, + { + "epoch": 0.43711543194683733, + "grad_norm": 0.9110752940177917, + "learning_rate": 4.944611370121203e-06, + "loss": 0.7157, + "step": 1184 + }, + { + "epoch": 0.43748461727787347, + "grad_norm": 0.9197525382041931, + "learning_rate": 4.9445095098521765e-06, + "loss": 0.7595, + "step": 1185 + }, + { + "epoch": 0.43785380260890966, + "grad_norm": 0.9142046570777893, + "learning_rate": 4.944407557059236e-06, + "loss": 0.6909, + "step": 1186 + }, + { + "epoch": 0.43822298793994585, + "grad_norm": 0.8837047219276428, + "learning_rate": 4.944305511746242e-06, + "loss": 0.7213, + "step": 1187 + }, + { + "epoch": 0.43859217327098204, + "grad_norm": 0.9123366475105286, + "learning_rate": 4.944203373917056e-06, + "loss": 0.7507, + "step": 1188 + }, + { + "epoch": 0.43896135860201824, + "grad_norm": 0.9120188355445862, + "learning_rate": 4.944101143575542e-06, + "loss": 0.7194, + "step": 1189 + }, + { + "epoch": 0.4393305439330544, + "grad_norm": 0.9454036951065063, + "learning_rate": 4.943998820725573e-06, + "loss": 0.7523, + "step": 1190 + }, + { + "epoch": 0.43969972926409057, + "grad_norm": 0.9568728804588318, + "learning_rate": 4.943896405371019e-06, + "loss": 0.7193, + "step": 1191 + }, + { + "epoch": 0.44006891459512676, + "grad_norm": 0.9208801984786987, + "learning_rate": 4.9437938975157586e-06, + "loss": 0.7172, + "step": 1192 + }, + { + "epoch": 0.44043809992616295, + "grad_norm": 0.9229491353034973, + "learning_rate": 4.9436912971636695e-06, + "loss": 0.738, + "step": 1193 + }, + { + "epoch": 0.4408072852571991, + "grad_norm": 0.9245941042900085, + "learning_rate": 4.943588604318635e-06, + "loss": 0.7437, + "step": 1194 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 0.8792277574539185, + "learning_rate": 4.943485818984545e-06, + "loss": 0.7363, + "step": 1195 + }, + { + "epoch": 0.4415456559192715, + "grad_norm": 0.9498505592346191, + "learning_rate": 4.9433829411652864e-06, + "loss": 0.7757, + "step": 1196 + }, + { + "epoch": 0.44191484125030767, + "grad_norm": 0.9258801341056824, + "learning_rate": 4.943279970864755e-06, + "loss": 0.7355, + "step": 1197 + }, + { + "epoch": 0.44228402658134386, + "grad_norm": 0.9117864370346069, + "learning_rate": 4.943176908086849e-06, + "loss": 0.7208, + "step": 1198 + }, + { + "epoch": 0.44265321191238, + "grad_norm": 0.8816313743591309, + "learning_rate": 4.9430737528354665e-06, + "loss": 0.6972, + "step": 1199 + }, + { + "epoch": 0.4430223972434162, + "grad_norm": 0.9166438579559326, + "learning_rate": 4.942970505114514e-06, + "loss": 0.76, + "step": 1200 + }, + { + "epoch": 0.4433915825744524, + "grad_norm": 0.8871222138404846, + "learning_rate": 4.942867164927899e-06, + "loss": 0.7348, + "step": 1201 + }, + { + "epoch": 0.4437607679054886, + "grad_norm": 0.9092727303504944, + "learning_rate": 4.942763732279533e-06, + "loss": 0.7114, + "step": 1202 + }, + { + "epoch": 0.4441299532365247, + "grad_norm": 0.9247320294380188, + "learning_rate": 4.94266020717333e-06, + "loss": 0.756, + "step": 1203 + }, + { + "epoch": 0.4444991385675609, + "grad_norm": 0.9279753565788269, + "learning_rate": 4.94255658961321e-06, + "loss": 0.7269, + "step": 1204 + }, + { + "epoch": 0.4448683238985971, + "grad_norm": 0.917389452457428, + "learning_rate": 4.942452879603094e-06, + "loss": 0.7285, + "step": 1205 + }, + { + "epoch": 0.4452375092296333, + "grad_norm": 0.8783107399940491, + "learning_rate": 4.942349077146906e-06, + "loss": 0.7487, + "step": 1206 + }, + { + "epoch": 0.4456066945606695, + "grad_norm": 0.9037907719612122, + "learning_rate": 4.9422451822485776e-06, + "loss": 0.7436, + "step": 1207 + }, + { + "epoch": 0.4459758798917056, + "grad_norm": 0.8458569049835205, + "learning_rate": 4.942141194912039e-06, + "loss": 0.6799, + "step": 1208 + }, + { + "epoch": 0.4463450652227418, + "grad_norm": 0.882938802242279, + "learning_rate": 4.942037115141228e-06, + "loss": 0.7216, + "step": 1209 + }, + { + "epoch": 0.446714250553778, + "grad_norm": 0.920384407043457, + "learning_rate": 4.9419329429400816e-06, + "loss": 0.7292, + "step": 1210 + }, + { + "epoch": 0.4470834358848142, + "grad_norm": 0.9709598422050476, + "learning_rate": 4.941828678312545e-06, + "loss": 0.7588, + "step": 1211 + }, + { + "epoch": 0.44745262121585033, + "grad_norm": 0.883941650390625, + "learning_rate": 4.941724321262563e-06, + "loss": 0.7007, + "step": 1212 + }, + { + "epoch": 0.4478218065468865, + "grad_norm": 0.9086169004440308, + "learning_rate": 4.941619871794087e-06, + "loss": 0.73, + "step": 1213 + }, + { + "epoch": 0.4481909918779227, + "grad_norm": 0.8642125129699707, + "learning_rate": 4.941515329911068e-06, + "loss": 0.6792, + "step": 1214 + }, + { + "epoch": 0.4485601772089589, + "grad_norm": 0.9487695693969727, + "learning_rate": 4.941410695617464e-06, + "loss": 0.7419, + "step": 1215 + }, + { + "epoch": 0.4489293625399951, + "grad_norm": 0.994884729385376, + "learning_rate": 4.941305968917238e-06, + "loss": 0.6888, + "step": 1216 + }, + { + "epoch": 0.44929854787103124, + "grad_norm": 0.9372398853302002, + "learning_rate": 4.941201149814349e-06, + "loss": 0.7325, + "step": 1217 + }, + { + "epoch": 0.44966773320206743, + "grad_norm": 0.9120617508888245, + "learning_rate": 4.94109623831277e-06, + "loss": 0.7697, + "step": 1218 + }, + { + "epoch": 0.4500369185331036, + "grad_norm": 0.9304324388504028, + "learning_rate": 4.940991234416466e-06, + "loss": 0.74, + "step": 1219 + }, + { + "epoch": 0.4504061038641398, + "grad_norm": 0.8959391713142395, + "learning_rate": 4.940886138129415e-06, + "loss": 0.7074, + "step": 1220 + }, + { + "epoch": 0.45077528919517595, + "grad_norm": 0.9347814917564392, + "learning_rate": 4.940780949455595e-06, + "loss": 0.7111, + "step": 1221 + }, + { + "epoch": 0.45114447452621215, + "grad_norm": 0.8901122212409973, + "learning_rate": 4.940675668398986e-06, + "loss": 0.74, + "step": 1222 + }, + { + "epoch": 0.45151365985724834, + "grad_norm": 0.9231247305870056, + "learning_rate": 4.940570294963572e-06, + "loss": 0.7486, + "step": 1223 + }, + { + "epoch": 0.45188284518828453, + "grad_norm": 0.9006732106208801, + "learning_rate": 4.940464829153343e-06, + "loss": 0.7592, + "step": 1224 + }, + { + "epoch": 0.4522520305193207, + "grad_norm": 0.8815886974334717, + "learning_rate": 4.940359270972291e-06, + "loss": 0.6919, + "step": 1225 + }, + { + "epoch": 0.45262121585035686, + "grad_norm": 0.8991600275039673, + "learning_rate": 4.940253620424411e-06, + "loss": 0.7178, + "step": 1226 + }, + { + "epoch": 0.45299040118139305, + "grad_norm": 0.8781217932701111, + "learning_rate": 4.940147877513701e-06, + "loss": 0.722, + "step": 1227 + }, + { + "epoch": 0.45335958651242925, + "grad_norm": 0.9302307367324829, + "learning_rate": 4.940042042244164e-06, + "loss": 0.7535, + "step": 1228 + }, + { + "epoch": 0.45372877184346544, + "grad_norm": 0.9426242113113403, + "learning_rate": 4.9399361146198065e-06, + "loss": 0.7451, + "step": 1229 + }, + { + "epoch": 0.4540979571745016, + "grad_norm": 0.9411885738372803, + "learning_rate": 4.939830094644637e-06, + "loss": 0.7625, + "step": 1230 + }, + { + "epoch": 0.45446714250553777, + "grad_norm": 0.9621394872665405, + "learning_rate": 4.939723982322667e-06, + "loss": 0.6874, + "step": 1231 + }, + { + "epoch": 0.45483632783657396, + "grad_norm": 0.928747296333313, + "learning_rate": 4.939617777657916e-06, + "loss": 0.761, + "step": 1232 + }, + { + "epoch": 0.45520551316761015, + "grad_norm": 0.8658984899520874, + "learning_rate": 4.939511480654401e-06, + "loss": 0.7426, + "step": 1233 + }, + { + "epoch": 0.45557469849864635, + "grad_norm": 0.901623010635376, + "learning_rate": 4.939405091316147e-06, + "loss": 0.7723, + "step": 1234 + }, + { + "epoch": 0.4559438838296825, + "grad_norm": 0.8813204765319824, + "learning_rate": 4.9392986096471796e-06, + "loss": 0.7268, + "step": 1235 + }, + { + "epoch": 0.4563130691607187, + "grad_norm": 0.9547492265701294, + "learning_rate": 4.93919203565153e-06, + "loss": 0.7687, + "step": 1236 + }, + { + "epoch": 0.45668225449175487, + "grad_norm": 0.9250112175941467, + "learning_rate": 4.939085369333232e-06, + "loss": 0.7749, + "step": 1237 + }, + { + "epoch": 0.45705143982279106, + "grad_norm": 0.9013698697090149, + "learning_rate": 4.938978610696322e-06, + "loss": 0.7553, + "step": 1238 + }, + { + "epoch": 0.4574206251538272, + "grad_norm": 0.8597696423530579, + "learning_rate": 4.938871759744842e-06, + "loss": 0.6982, + "step": 1239 + }, + { + "epoch": 0.4577898104848634, + "grad_norm": 0.9342607259750366, + "learning_rate": 4.938764816482835e-06, + "loss": 0.7266, + "step": 1240 + }, + { + "epoch": 0.4581589958158996, + "grad_norm": 0.9153682589530945, + "learning_rate": 4.93865778091435e-06, + "loss": 0.7474, + "step": 1241 + }, + { + "epoch": 0.4585281811469358, + "grad_norm": 0.9273695945739746, + "learning_rate": 4.938550653043437e-06, + "loss": 0.7094, + "step": 1242 + }, + { + "epoch": 0.45889736647797197, + "grad_norm": 0.9250311255455017, + "learning_rate": 4.938443432874151e-06, + "loss": 0.7576, + "step": 1243 + }, + { + "epoch": 0.4592665518090081, + "grad_norm": 0.9787151217460632, + "learning_rate": 4.938336120410551e-06, + "loss": 0.7324, + "step": 1244 + }, + { + "epoch": 0.4596357371400443, + "grad_norm": 0.8964402079582214, + "learning_rate": 4.938228715656699e-06, + "loss": 0.766, + "step": 1245 + }, + { + "epoch": 0.4600049224710805, + "grad_norm": 0.9307600259780884, + "learning_rate": 4.938121218616659e-06, + "loss": 0.7328, + "step": 1246 + }, + { + "epoch": 0.4603741078021167, + "grad_norm": 0.8917447328567505, + "learning_rate": 4.938013629294502e-06, + "loss": 0.7606, + "step": 1247 + }, + { + "epoch": 0.4607432931331528, + "grad_norm": 0.902154803276062, + "learning_rate": 4.937905947694296e-06, + "loss": 0.6913, + "step": 1248 + }, + { + "epoch": 0.461112478464189, + "grad_norm": 0.9088221192359924, + "learning_rate": 4.937798173820121e-06, + "loss": 0.7124, + "step": 1249 + }, + { + "epoch": 0.4614816637952252, + "grad_norm": 0.8867905735969543, + "learning_rate": 4.937690307676054e-06, + "loss": 0.741, + "step": 1250 + }, + { + "epoch": 0.4618508491262614, + "grad_norm": 0.9009400606155396, + "learning_rate": 4.937582349266178e-06, + "loss": 0.7393, + "step": 1251 + }, + { + "epoch": 0.4622200344572976, + "grad_norm": 0.9617549777030945, + "learning_rate": 4.937474298594579e-06, + "loss": 0.7683, + "step": 1252 + }, + { + "epoch": 0.4625892197883337, + "grad_norm": 0.9465776085853577, + "learning_rate": 4.937366155665348e-06, + "loss": 0.7684, + "step": 1253 + }, + { + "epoch": 0.4629584051193699, + "grad_norm": 0.9941141605377197, + "learning_rate": 4.9372579204825775e-06, + "loss": 0.7538, + "step": 1254 + }, + { + "epoch": 0.4633275904504061, + "grad_norm": 0.8829614520072937, + "learning_rate": 4.937149593050363e-06, + "loss": 0.7195, + "step": 1255 + }, + { + "epoch": 0.4636967757814423, + "grad_norm": 0.8967337608337402, + "learning_rate": 4.937041173372806e-06, + "loss": 0.7436, + "step": 1256 + }, + { + "epoch": 0.46406596111247844, + "grad_norm": 0.8753035664558411, + "learning_rate": 4.9369326614540096e-06, + "loss": 0.7431, + "step": 1257 + }, + { + "epoch": 0.46443514644351463, + "grad_norm": 0.9020246863365173, + "learning_rate": 4.936824057298081e-06, + "loss": 0.7362, + "step": 1258 + }, + { + "epoch": 0.4648043317745508, + "grad_norm": 0.8981832265853882, + "learning_rate": 4.936715360909131e-06, + "loss": 0.7306, + "step": 1259 + }, + { + "epoch": 0.465173517105587, + "grad_norm": 0.9331729412078857, + "learning_rate": 4.9366065722912735e-06, + "loss": 0.7471, + "step": 1260 + }, + { + "epoch": 0.4655427024366232, + "grad_norm": 0.9088561534881592, + "learning_rate": 4.936497691448627e-06, + "loss": 0.7693, + "step": 1261 + }, + { + "epoch": 0.46591188776765935, + "grad_norm": 0.9289308190345764, + "learning_rate": 4.936388718385311e-06, + "loss": 0.7338, + "step": 1262 + }, + { + "epoch": 0.46628107309869554, + "grad_norm": 0.9137311577796936, + "learning_rate": 4.936279653105452e-06, + "loss": 0.7147, + "step": 1263 + }, + { + "epoch": 0.46665025842973173, + "grad_norm": 0.8960225582122803, + "learning_rate": 4.936170495613175e-06, + "loss": 0.7154, + "step": 1264 + }, + { + "epoch": 0.4670194437607679, + "grad_norm": 0.8980303406715393, + "learning_rate": 4.936061245912615e-06, + "loss": 0.7259, + "step": 1265 + }, + { + "epoch": 0.46738862909180406, + "grad_norm": 0.9025071263313293, + "learning_rate": 4.935951904007906e-06, + "loss": 0.7525, + "step": 1266 + }, + { + "epoch": 0.46775781442284026, + "grad_norm": 0.8963534235954285, + "learning_rate": 4.935842469903186e-06, + "loss": 0.721, + "step": 1267 + }, + { + "epoch": 0.46812699975387645, + "grad_norm": 0.9002708196640015, + "learning_rate": 4.935732943602597e-06, + "loss": 0.7474, + "step": 1268 + }, + { + "epoch": 0.46849618508491264, + "grad_norm": 0.9217738509178162, + "learning_rate": 4.935623325110285e-06, + "loss": 0.7647, + "step": 1269 + }, + { + "epoch": 0.4688653704159488, + "grad_norm": 0.9251262545585632, + "learning_rate": 4.935513614430399e-06, + "loss": 0.732, + "step": 1270 + }, + { + "epoch": 0.46923455574698497, + "grad_norm": 0.8719298243522644, + "learning_rate": 4.935403811567091e-06, + "loss": 0.7526, + "step": 1271 + }, + { + "epoch": 0.46960374107802116, + "grad_norm": 0.893237292766571, + "learning_rate": 4.935293916524517e-06, + "loss": 0.7306, + "step": 1272 + }, + { + "epoch": 0.46997292640905736, + "grad_norm": 0.8918770551681519, + "learning_rate": 4.935183929306837e-06, + "loss": 0.7181, + "step": 1273 + }, + { + "epoch": 0.47034211174009355, + "grad_norm": 0.914319634437561, + "learning_rate": 4.935073849918214e-06, + "loss": 0.7679, + "step": 1274 + }, + { + "epoch": 0.4707112970711297, + "grad_norm": 0.895769476890564, + "learning_rate": 4.934963678362815e-06, + "loss": 0.7601, + "step": 1275 + }, + { + "epoch": 0.4710804824021659, + "grad_norm": 0.8876564502716064, + "learning_rate": 4.934853414644808e-06, + "loss": 0.733, + "step": 1276 + }, + { + "epoch": 0.47144966773320207, + "grad_norm": 0.9218546152114868, + "learning_rate": 4.934743058768369e-06, + "loss": 0.7578, + "step": 1277 + }, + { + "epoch": 0.47181885306423826, + "grad_norm": 0.9365533590316772, + "learning_rate": 4.934632610737673e-06, + "loss": 0.7511, + "step": 1278 + }, + { + "epoch": 0.4721880383952744, + "grad_norm": 0.8840087652206421, + "learning_rate": 4.934522070556901e-06, + "loss": 0.7085, + "step": 1279 + }, + { + "epoch": 0.4725572237263106, + "grad_norm": 0.8947144150733948, + "learning_rate": 4.934411438230237e-06, + "loss": 0.7547, + "step": 1280 + }, + { + "epoch": 0.4729264090573468, + "grad_norm": 0.9141665697097778, + "learning_rate": 4.934300713761868e-06, + "loss": 0.7278, + "step": 1281 + }, + { + "epoch": 0.473295594388383, + "grad_norm": 0.8770225048065186, + "learning_rate": 4.9341898971559856e-06, + "loss": 0.7245, + "step": 1282 + }, + { + "epoch": 0.47366477971941917, + "grad_norm": 0.8997796177864075, + "learning_rate": 4.934078988416784e-06, + "loss": 0.775, + "step": 1283 + }, + { + "epoch": 0.4740339650504553, + "grad_norm": 0.9298402667045593, + "learning_rate": 4.933967987548461e-06, + "loss": 0.7564, + "step": 1284 + }, + { + "epoch": 0.4744031503814915, + "grad_norm": 0.9105640053749084, + "learning_rate": 4.933856894555218e-06, + "loss": 0.7507, + "step": 1285 + }, + { + "epoch": 0.4747723357125277, + "grad_norm": 0.8863377571105957, + "learning_rate": 4.933745709441259e-06, + "loss": 0.7113, + "step": 1286 + }, + { + "epoch": 0.4751415210435639, + "grad_norm": 0.897258996963501, + "learning_rate": 4.9336344322107935e-06, + "loss": 0.7776, + "step": 1287 + }, + { + "epoch": 0.4755107063746, + "grad_norm": 0.8783311247825623, + "learning_rate": 4.933523062868033e-06, + "loss": 0.7136, + "step": 1288 + }, + { + "epoch": 0.4758798917056362, + "grad_norm": 0.8907281160354614, + "learning_rate": 4.933411601417192e-06, + "loss": 0.6871, + "step": 1289 + }, + { + "epoch": 0.4762490770366724, + "grad_norm": 0.9048116207122803, + "learning_rate": 4.93330004786249e-06, + "loss": 0.7545, + "step": 1290 + }, + { + "epoch": 0.4766182623677086, + "grad_norm": 0.8691855669021606, + "learning_rate": 4.933188402208149e-06, + "loss": 0.703, + "step": 1291 + }, + { + "epoch": 0.4769874476987448, + "grad_norm": 0.8851851224899292, + "learning_rate": 4.933076664458395e-06, + "loss": 0.7349, + "step": 1292 + }, + { + "epoch": 0.47735663302978093, + "grad_norm": 0.9410663843154907, + "learning_rate": 4.9329648346174575e-06, + "loss": 0.7593, + "step": 1293 + }, + { + "epoch": 0.4777258183608171, + "grad_norm": 0.9117968082427979, + "learning_rate": 4.932852912689569e-06, + "loss": 0.7231, + "step": 1294 + }, + { + "epoch": 0.4780950036918533, + "grad_norm": 0.9473034143447876, + "learning_rate": 4.932740898678965e-06, + "loss": 0.7467, + "step": 1295 + }, + { + "epoch": 0.4784641890228895, + "grad_norm": 0.9036644697189331, + "learning_rate": 4.932628792589887e-06, + "loss": 0.7426, + "step": 1296 + }, + { + "epoch": 0.47883337435392564, + "grad_norm": 0.9032191038131714, + "learning_rate": 4.932516594426575e-06, + "loss": 0.7258, + "step": 1297 + }, + { + "epoch": 0.47920255968496184, + "grad_norm": 0.864815354347229, + "learning_rate": 4.932404304193279e-06, + "loss": 0.7088, + "step": 1298 + }, + { + "epoch": 0.47957174501599803, + "grad_norm": 0.8944458961486816, + "learning_rate": 4.9322919218942466e-06, + "loss": 0.7706, + "step": 1299 + }, + { + "epoch": 0.4799409303470342, + "grad_norm": 0.9362423419952393, + "learning_rate": 4.932179447533734e-06, + "loss": 0.7286, + "step": 1300 + }, + { + "epoch": 0.4803101156780704, + "grad_norm": 0.9102439284324646, + "learning_rate": 4.9320668811159954e-06, + "loss": 0.7096, + "step": 1301 + }, + { + "epoch": 0.48067930100910655, + "grad_norm": 0.8899404406547546, + "learning_rate": 4.931954222645294e-06, + "loss": 0.7413, + "step": 1302 + }, + { + "epoch": 0.48104848634014274, + "grad_norm": 0.9097535610198975, + "learning_rate": 4.9318414721258924e-06, + "loss": 0.7417, + "step": 1303 + }, + { + "epoch": 0.48141767167117894, + "grad_norm": 0.9498276114463806, + "learning_rate": 4.931728629562059e-06, + "loss": 0.724, + "step": 1304 + }, + { + "epoch": 0.48178685700221513, + "grad_norm": 0.9295927882194519, + "learning_rate": 4.9316156949580645e-06, + "loss": 0.7388, + "step": 1305 + }, + { + "epoch": 0.48215604233325127, + "grad_norm": 0.907588541507721, + "learning_rate": 4.931502668318183e-06, + "loss": 0.7486, + "step": 1306 + }, + { + "epoch": 0.48252522766428746, + "grad_norm": 0.9081675410270691, + "learning_rate": 4.9313895496466936e-06, + "loss": 0.7562, + "step": 1307 + }, + { + "epoch": 0.48289441299532365, + "grad_norm": 0.9324904084205627, + "learning_rate": 4.931276338947876e-06, + "loss": 0.7588, + "step": 1308 + }, + { + "epoch": 0.48326359832635984, + "grad_norm": 0.9412097930908203, + "learning_rate": 4.931163036226017e-06, + "loss": 0.7773, + "step": 1309 + }, + { + "epoch": 0.48363278365739604, + "grad_norm": 0.9143854975700378, + "learning_rate": 4.931049641485404e-06, + "loss": 0.7591, + "step": 1310 + }, + { + "epoch": 0.4840019689884322, + "grad_norm": 0.8855016827583313, + "learning_rate": 4.930936154730329e-06, + "loss": 0.7749, + "step": 1311 + }, + { + "epoch": 0.48437115431946837, + "grad_norm": 0.9048222303390503, + "learning_rate": 4.930822575965089e-06, + "loss": 0.7268, + "step": 1312 + }, + { + "epoch": 0.48474033965050456, + "grad_norm": 0.9106447100639343, + "learning_rate": 4.93070890519398e-06, + "loss": 0.7537, + "step": 1313 + }, + { + "epoch": 0.48510952498154075, + "grad_norm": 0.9264537692070007, + "learning_rate": 4.930595142421307e-06, + "loss": 0.7544, + "step": 1314 + }, + { + "epoch": 0.4854787103125769, + "grad_norm": 0.9199881553649902, + "learning_rate": 4.930481287651375e-06, + "loss": 0.7032, + "step": 1315 + }, + { + "epoch": 0.4858478956436131, + "grad_norm": 0.9257407188415527, + "learning_rate": 4.930367340888494e-06, + "loss": 0.7343, + "step": 1316 + }, + { + "epoch": 0.4862170809746493, + "grad_norm": 0.9798755645751953, + "learning_rate": 4.930253302136976e-06, + "loss": 0.7448, + "step": 1317 + }, + { + "epoch": 0.48658626630568547, + "grad_norm": 0.919750988483429, + "learning_rate": 4.930139171401136e-06, + "loss": 0.7061, + "step": 1318 + }, + { + "epoch": 0.48695545163672166, + "grad_norm": 0.9479880332946777, + "learning_rate": 4.930024948685297e-06, + "loss": 0.7962, + "step": 1319 + }, + { + "epoch": 0.4873246369677578, + "grad_norm": 0.9518943428993225, + "learning_rate": 4.92991063399378e-06, + "loss": 0.7174, + "step": 1320 + }, + { + "epoch": 0.487693822298794, + "grad_norm": 0.908926784992218, + "learning_rate": 4.929796227330912e-06, + "loss": 0.7462, + "step": 1321 + }, + { + "epoch": 0.4880630076298302, + "grad_norm": 0.8598317503929138, + "learning_rate": 4.929681728701023e-06, + "loss": 0.7367, + "step": 1322 + }, + { + "epoch": 0.4884321929608664, + "grad_norm": 0.871263325214386, + "learning_rate": 4.929567138108449e-06, + "loss": 0.7283, + "step": 1323 + }, + { + "epoch": 0.4888013782919025, + "grad_norm": 0.8650959134101868, + "learning_rate": 4.9294524555575255e-06, + "loss": 0.703, + "step": 1324 + }, + { + "epoch": 0.4891705636229387, + "grad_norm": 0.9248819351196289, + "learning_rate": 4.9293376810525925e-06, + "loss": 0.7485, + "step": 1325 + }, + { + "epoch": 0.4895397489539749, + "grad_norm": 0.9359372854232788, + "learning_rate": 4.929222814597995e-06, + "loss": 0.7505, + "step": 1326 + }, + { + "epoch": 0.4899089342850111, + "grad_norm": 0.9581688046455383, + "learning_rate": 4.929107856198081e-06, + "loss": 0.747, + "step": 1327 + }, + { + "epoch": 0.4902781196160473, + "grad_norm": 0.9165839552879333, + "learning_rate": 4.928992805857201e-06, + "loss": 0.7406, + "step": 1328 + }, + { + "epoch": 0.4906473049470834, + "grad_norm": 0.9622183442115784, + "learning_rate": 4.9288776635797105e-06, + "loss": 0.7578, + "step": 1329 + }, + { + "epoch": 0.4910164902781196, + "grad_norm": 0.9130443930625916, + "learning_rate": 4.928762429369966e-06, + "loss": 0.7165, + "step": 1330 + }, + { + "epoch": 0.4913856756091558, + "grad_norm": 0.9140217304229736, + "learning_rate": 4.928647103232331e-06, + "loss": 0.7387, + "step": 1331 + }, + { + "epoch": 0.491754860940192, + "grad_norm": 0.9288978576660156, + "learning_rate": 4.928531685171169e-06, + "loss": 0.732, + "step": 1332 + }, + { + "epoch": 0.49212404627122813, + "grad_norm": 0.9056506156921387, + "learning_rate": 4.92841617519085e-06, + "loss": 0.6913, + "step": 1333 + }, + { + "epoch": 0.4924932316022643, + "grad_norm": 0.8599223494529724, + "learning_rate": 4.928300573295744e-06, + "loss": 0.7318, + "step": 1334 + }, + { + "epoch": 0.4928624169333005, + "grad_norm": 0.9114431142807007, + "learning_rate": 4.928184879490228e-06, + "loss": 0.6987, + "step": 1335 + }, + { + "epoch": 0.4932316022643367, + "grad_norm": 0.910118579864502, + "learning_rate": 4.9280690937786815e-06, + "loss": 0.7212, + "step": 1336 + }, + { + "epoch": 0.4936007875953729, + "grad_norm": 0.9045255780220032, + "learning_rate": 4.927953216165486e-06, + "loss": 0.692, + "step": 1337 + }, + { + "epoch": 0.49396997292640904, + "grad_norm": 0.9406313896179199, + "learning_rate": 4.927837246655027e-06, + "loss": 0.7439, + "step": 1338 + }, + { + "epoch": 0.49433915825744523, + "grad_norm": 0.9328321218490601, + "learning_rate": 4.9277211852516945e-06, + "loss": 0.7221, + "step": 1339 + }, + { + "epoch": 0.4947083435884814, + "grad_norm": 0.9344122409820557, + "learning_rate": 4.927605031959882e-06, + "loss": 0.7485, + "step": 1340 + }, + { + "epoch": 0.4950775289195176, + "grad_norm": 0.8839752674102783, + "learning_rate": 4.9274887867839845e-06, + "loss": 0.7085, + "step": 1341 + }, + { + "epoch": 0.49544671425055375, + "grad_norm": 0.9064518809318542, + "learning_rate": 4.9273724497284025e-06, + "loss": 0.7123, + "step": 1342 + }, + { + "epoch": 0.49581589958158995, + "grad_norm": 0.8806946873664856, + "learning_rate": 4.9272560207975395e-06, + "loss": 0.7362, + "step": 1343 + }, + { + "epoch": 0.49618508491262614, + "grad_norm": 0.9120421409606934, + "learning_rate": 4.9271394999958025e-06, + "loss": 0.7271, + "step": 1344 + }, + { + "epoch": 0.49655427024366233, + "grad_norm": 0.8838410377502441, + "learning_rate": 4.927022887327601e-06, + "loss": 0.7167, + "step": 1345 + }, + { + "epoch": 0.4969234555746985, + "grad_norm": 0.8578704595565796, + "learning_rate": 4.926906182797349e-06, + "loss": 0.6924, + "step": 1346 + }, + { + "epoch": 0.49729264090573466, + "grad_norm": 0.8794922828674316, + "learning_rate": 4.9267893864094644e-06, + "loss": 0.6941, + "step": 1347 + }, + { + "epoch": 0.49766182623677085, + "grad_norm": 0.902854323387146, + "learning_rate": 4.926672498168368e-06, + "loss": 0.7546, + "step": 1348 + }, + { + "epoch": 0.49803101156780705, + "grad_norm": 0.9118044376373291, + "learning_rate": 4.926555518078482e-06, + "loss": 0.7321, + "step": 1349 + }, + { + "epoch": 0.49840019689884324, + "grad_norm": 0.9261218905448914, + "learning_rate": 4.926438446144237e-06, + "loss": 0.7313, + "step": 1350 + }, + { + "epoch": 0.4987693822298794, + "grad_norm": 0.887624979019165, + "learning_rate": 4.9263212823700616e-06, + "loss": 0.7024, + "step": 1351 + }, + { + "epoch": 0.49913856756091557, + "grad_norm": 0.899479329586029, + "learning_rate": 4.926204026760392e-06, + "loss": 0.7183, + "step": 1352 + }, + { + "epoch": 0.49950775289195176, + "grad_norm": 0.8791046142578125, + "learning_rate": 4.926086679319665e-06, + "loss": 0.7311, + "step": 1353 + }, + { + "epoch": 0.49987693822298795, + "grad_norm": 0.9053353667259216, + "learning_rate": 4.925969240052323e-06, + "loss": 0.7332, + "step": 1354 + }, + { + "epoch": 0.5002461235540241, + "grad_norm": 0.8757712244987488, + "learning_rate": 4.925851708962811e-06, + "loss": 0.748, + "step": 1355 + }, + { + "epoch": 0.5006153088850603, + "grad_norm": 0.9180270433425903, + "learning_rate": 4.925734086055578e-06, + "loss": 0.7304, + "step": 1356 + }, + { + "epoch": 0.5009844942160965, + "grad_norm": 0.8982318043708801, + "learning_rate": 4.9256163713350745e-06, + "loss": 0.7179, + "step": 1357 + }, + { + "epoch": 0.5013536795471326, + "grad_norm": 0.8590686321258545, + "learning_rate": 4.925498564805757e-06, + "loss": 0.6921, + "step": 1358 + }, + { + "epoch": 0.5017228648781689, + "grad_norm": 0.9188492894172668, + "learning_rate": 4.925380666472085e-06, + "loss": 0.719, + "step": 1359 + }, + { + "epoch": 0.502092050209205, + "grad_norm": 0.8929955363273621, + "learning_rate": 4.92526267633852e-06, + "loss": 0.7179, + "step": 1360 + }, + { + "epoch": 0.5024612355402412, + "grad_norm": 0.9286133646965027, + "learning_rate": 4.925144594409528e-06, + "loss": 0.7686, + "step": 1361 + }, + { + "epoch": 0.5028304208712774, + "grad_norm": 0.869183361530304, + "learning_rate": 4.925026420689579e-06, + "loss": 0.6961, + "step": 1362 + }, + { + "epoch": 0.5031996062023135, + "grad_norm": 0.8674918413162231, + "learning_rate": 4.924908155183145e-06, + "loss": 0.7365, + "step": 1363 + }, + { + "epoch": 0.5035687915333498, + "grad_norm": 0.9061448574066162, + "learning_rate": 4.924789797894701e-06, + "loss": 0.7142, + "step": 1364 + }, + { + "epoch": 0.5039379768643859, + "grad_norm": 0.9563679695129395, + "learning_rate": 4.924671348828731e-06, + "loss": 0.7383, + "step": 1365 + }, + { + "epoch": 0.5043071621954222, + "grad_norm": 0.8675025701522827, + "learning_rate": 4.924552807989715e-06, + "loss": 0.7392, + "step": 1366 + }, + { + "epoch": 0.5046763475264583, + "grad_norm": 0.8793214559555054, + "learning_rate": 4.9244341753821396e-06, + "loss": 0.7712, + "step": 1367 + }, + { + "epoch": 0.5050455328574944, + "grad_norm": 0.9035436511039734, + "learning_rate": 4.924315451010496e-06, + "loss": 0.7131, + "step": 1368 + }, + { + "epoch": 0.5054147181885307, + "grad_norm": 0.8869012594223022, + "learning_rate": 4.924196634879278e-06, + "loss": 0.73, + "step": 1369 + }, + { + "epoch": 0.5057839035195668, + "grad_norm": 0.9123469591140747, + "learning_rate": 4.9240777269929825e-06, + "loss": 0.7164, + "step": 1370 + }, + { + "epoch": 0.506153088850603, + "grad_norm": 0.8680907487869263, + "learning_rate": 4.923958727356109e-06, + "loss": 0.7002, + "step": 1371 + }, + { + "epoch": 0.5065222741816392, + "grad_norm": 0.9459185004234314, + "learning_rate": 4.923839635973165e-06, + "loss": 0.7234, + "step": 1372 + }, + { + "epoch": 0.5068914595126753, + "grad_norm": 0.8962308168411255, + "learning_rate": 4.923720452848653e-06, + "loss": 0.7349, + "step": 1373 + }, + { + "epoch": 0.5072606448437116, + "grad_norm": 0.8820279836654663, + "learning_rate": 4.92360117798709e-06, + "loss": 0.7304, + "step": 1374 + }, + { + "epoch": 0.5076298301747477, + "grad_norm": 0.8836473226547241, + "learning_rate": 4.923481811392985e-06, + "loss": 0.749, + "step": 1375 + }, + { + "epoch": 0.5079990155057839, + "grad_norm": 0.8512134552001953, + "learning_rate": 4.923362353070859e-06, + "loss": 0.7129, + "step": 1376 + }, + { + "epoch": 0.5083682008368201, + "grad_norm": 0.8974927663803101, + "learning_rate": 4.923242803025232e-06, + "loss": 0.6806, + "step": 1377 + }, + { + "epoch": 0.5087373861678562, + "grad_norm": 0.8919023275375366, + "learning_rate": 4.92312316126063e-06, + "loss": 0.7334, + "step": 1378 + }, + { + "epoch": 0.5091065714988925, + "grad_norm": 0.9181932210922241, + "learning_rate": 4.923003427781582e-06, + "loss": 0.7337, + "step": 1379 + }, + { + "epoch": 0.5094757568299286, + "grad_norm": 0.8906370997428894, + "learning_rate": 4.9228836025926185e-06, + "loss": 0.7356, + "step": 1380 + }, + { + "epoch": 0.5098449421609648, + "grad_norm": 1.0170215368270874, + "learning_rate": 4.922763685698275e-06, + "loss": 0.7247, + "step": 1381 + }, + { + "epoch": 0.510214127492001, + "grad_norm": 0.898322343826294, + "learning_rate": 4.922643677103091e-06, + "loss": 0.7254, + "step": 1382 + }, + { + "epoch": 0.5105833128230371, + "grad_norm": 0.9089271426200867, + "learning_rate": 4.922523576811607e-06, + "loss": 0.7618, + "step": 1383 + }, + { + "epoch": 0.5109524981540734, + "grad_norm": 0.9560837745666504, + "learning_rate": 4.922403384828373e-06, + "loss": 0.7461, + "step": 1384 + }, + { + "epoch": 0.5113216834851095, + "grad_norm": 0.9104019403457642, + "learning_rate": 4.922283101157933e-06, + "loss": 0.7622, + "step": 1385 + }, + { + "epoch": 0.5116908688161457, + "grad_norm": 0.8937442898750305, + "learning_rate": 4.922162725804843e-06, + "loss": 0.6968, + "step": 1386 + }, + { + "epoch": 0.5120600541471819, + "grad_norm": 0.9203770160675049, + "learning_rate": 4.922042258773658e-06, + "loss": 0.7144, + "step": 1387 + }, + { + "epoch": 0.5124292394782181, + "grad_norm": 0.9218546748161316, + "learning_rate": 4.921921700068938e-06, + "loss": 0.7849, + "step": 1388 + }, + { + "epoch": 0.5127984248092542, + "grad_norm": 0.9261901378631592, + "learning_rate": 4.921801049695246e-06, + "loss": 0.744, + "step": 1389 + }, + { + "epoch": 0.5131676101402904, + "grad_norm": 0.8870830535888672, + "learning_rate": 4.92168030765715e-06, + "loss": 0.719, + "step": 1390 + }, + { + "epoch": 0.5135367954713266, + "grad_norm": 0.8306862711906433, + "learning_rate": 4.921559473959217e-06, + "loss": 0.6566, + "step": 1391 + }, + { + "epoch": 0.5139059808023628, + "grad_norm": 0.9043039083480835, + "learning_rate": 4.921438548606022e-06, + "loss": 0.7366, + "step": 1392 + }, + { + "epoch": 0.514275166133399, + "grad_norm": 0.914897084236145, + "learning_rate": 4.921317531602143e-06, + "loss": 0.7372, + "step": 1393 + }, + { + "epoch": 0.5146443514644351, + "grad_norm": 0.8624985814094543, + "learning_rate": 4.921196422952159e-06, + "loss": 0.771, + "step": 1394 + }, + { + "epoch": 0.5150135367954713, + "grad_norm": 0.8990124464035034, + "learning_rate": 4.921075222660655e-06, + "loss": 0.7263, + "step": 1395 + }, + { + "epoch": 0.5153827221265075, + "grad_norm": 0.906343400478363, + "learning_rate": 4.920953930732217e-06, + "loss": 0.732, + "step": 1396 + }, + { + "epoch": 0.5157519074575437, + "grad_norm": 0.8977361917495728, + "learning_rate": 4.920832547171438e-06, + "loss": 0.7035, + "step": 1397 + }, + { + "epoch": 0.5161210927885799, + "grad_norm": 0.8892781138420105, + "learning_rate": 4.920711071982911e-06, + "loss": 0.7441, + "step": 1398 + }, + { + "epoch": 0.516490278119616, + "grad_norm": 0.8844077587127686, + "learning_rate": 4.920589505171234e-06, + "loss": 0.7602, + "step": 1399 + }, + { + "epoch": 0.5168594634506523, + "grad_norm": 0.9034737348556519, + "learning_rate": 4.9204678467410075e-06, + "loss": 0.7123, + "step": 1400 + }, + { + "epoch": 0.5172286487816884, + "grad_norm": 0.9292963147163391, + "learning_rate": 4.920346096696837e-06, + "loss": 0.7266, + "step": 1401 + }, + { + "epoch": 0.5175978341127246, + "grad_norm": 0.9953920245170593, + "learning_rate": 4.920224255043331e-06, + "loss": 0.757, + "step": 1402 + }, + { + "epoch": 0.5179670194437608, + "grad_norm": 0.8745222091674805, + "learning_rate": 4.9201023217851e-06, + "loss": 0.7326, + "step": 1403 + }, + { + "epoch": 0.5183362047747969, + "grad_norm": 0.9055063724517822, + "learning_rate": 4.919980296926761e-06, + "loss": 0.7127, + "step": 1404 + }, + { + "epoch": 0.5187053901058332, + "grad_norm": 0.9204325079917908, + "learning_rate": 4.91985818047293e-06, + "loss": 0.7741, + "step": 1405 + }, + { + "epoch": 0.5190745754368693, + "grad_norm": 0.9215565919876099, + "learning_rate": 4.919735972428232e-06, + "loss": 0.7451, + "step": 1406 + }, + { + "epoch": 0.5194437607679054, + "grad_norm": 0.8616822361946106, + "learning_rate": 4.919613672797291e-06, + "loss": 0.6946, + "step": 1407 + }, + { + "epoch": 0.5198129460989417, + "grad_norm": 0.8958667516708374, + "learning_rate": 4.919491281584736e-06, + "loss": 0.7137, + "step": 1408 + }, + { + "epoch": 0.5201821314299778, + "grad_norm": 0.9338605999946594, + "learning_rate": 4.919368798795199e-06, + "loss": 0.7135, + "step": 1409 + }, + { + "epoch": 0.5205513167610141, + "grad_norm": 0.9158616662025452, + "learning_rate": 4.919246224433317e-06, + "loss": 0.7253, + "step": 1410 + }, + { + "epoch": 0.5209205020920502, + "grad_norm": 0.9109562635421753, + "learning_rate": 4.919123558503729e-06, + "loss": 0.7315, + "step": 1411 + }, + { + "epoch": 0.5212896874230863, + "grad_norm": 0.8872061967849731, + "learning_rate": 4.919000801011078e-06, + "loss": 0.7384, + "step": 1412 + }, + { + "epoch": 0.5216588727541226, + "grad_norm": 0.852870523929596, + "learning_rate": 4.918877951960009e-06, + "loss": 0.6982, + "step": 1413 + }, + { + "epoch": 0.5220280580851587, + "grad_norm": 0.8745459914207458, + "learning_rate": 4.918755011355174e-06, + "loss": 0.7137, + "step": 1414 + }, + { + "epoch": 0.522397243416195, + "grad_norm": 0.8926501274108887, + "learning_rate": 4.918631979201225e-06, + "loss": 0.7493, + "step": 1415 + }, + { + "epoch": 0.5227664287472311, + "grad_norm": 0.9324433207511902, + "learning_rate": 4.918508855502819e-06, + "loss": 0.6959, + "step": 1416 + }, + { + "epoch": 0.5231356140782673, + "grad_norm": 0.9326198697090149, + "learning_rate": 4.918385640264615e-06, + "loss": 0.7399, + "step": 1417 + }, + { + "epoch": 0.5235047994093035, + "grad_norm": 0.893012285232544, + "learning_rate": 4.9182623334912796e-06, + "loss": 0.7212, + "step": 1418 + }, + { + "epoch": 0.5238739847403396, + "grad_norm": 0.8747656941413879, + "learning_rate": 4.918138935187478e-06, + "loss": 0.7471, + "step": 1419 + }, + { + "epoch": 0.5242431700713759, + "grad_norm": 0.8935147523880005, + "learning_rate": 4.91801544535788e-06, + "loss": 0.7155, + "step": 1420 + }, + { + "epoch": 0.524612355402412, + "grad_norm": 0.9232352375984192, + "learning_rate": 4.91789186400716e-06, + "loss": 0.7442, + "step": 1421 + }, + { + "epoch": 0.5249815407334482, + "grad_norm": 0.8894848227500916, + "learning_rate": 4.917768191139997e-06, + "loss": 0.7587, + "step": 1422 + }, + { + "epoch": 0.5253507260644844, + "grad_norm": 0.8878769278526306, + "learning_rate": 4.91764442676107e-06, + "loss": 0.7335, + "step": 1423 + }, + { + "epoch": 0.5257199113955205, + "grad_norm": 1.0391231775283813, + "learning_rate": 4.917520570875065e-06, + "loss": 0.7053, + "step": 1424 + }, + { + "epoch": 0.5260890967265567, + "grad_norm": 0.9046094417572021, + "learning_rate": 4.91739662348667e-06, + "loss": 0.74, + "step": 1425 + }, + { + "epoch": 0.5264582820575929, + "grad_norm": 0.8990216851234436, + "learning_rate": 4.917272584600575e-06, + "loss": 0.7539, + "step": 1426 + }, + { + "epoch": 0.5268274673886291, + "grad_norm": 0.876078724861145, + "learning_rate": 4.917148454221477e-06, + "loss": 0.7024, + "step": 1427 + }, + { + "epoch": 0.5271966527196653, + "grad_norm": 0.9383296966552734, + "learning_rate": 4.917024232354071e-06, + "loss": 0.706, + "step": 1428 + }, + { + "epoch": 0.5275658380507015, + "grad_norm": 0.9111572504043579, + "learning_rate": 4.916899919003062e-06, + "loss": 0.7463, + "step": 1429 + }, + { + "epoch": 0.5279350233817376, + "grad_norm": 0.9223730564117432, + "learning_rate": 4.916775514173153e-06, + "loss": 0.759, + "step": 1430 + }, + { + "epoch": 0.5283042087127738, + "grad_norm": 1.1159535646438599, + "learning_rate": 4.916651017869054e-06, + "loss": 0.7213, + "step": 1431 + }, + { + "epoch": 0.52867339404381, + "grad_norm": 0.8813338279724121, + "learning_rate": 4.9165264300954765e-06, + "loss": 0.7215, + "step": 1432 + }, + { + "epoch": 0.5290425793748462, + "grad_norm": 0.90069180727005, + "learning_rate": 4.916401750857136e-06, + "loss": 0.738, + "step": 1433 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.8849632740020752, + "learning_rate": 4.9162769801587515e-06, + "loss": 0.6921, + "step": 1434 + }, + { + "epoch": 0.5297809500369185, + "grad_norm": 0.9068708419799805, + "learning_rate": 4.916152118005046e-06, + "loss": 0.7209, + "step": 1435 + }, + { + "epoch": 0.5301501353679547, + "grad_norm": 0.8788166046142578, + "learning_rate": 4.916027164400746e-06, + "loss": 0.6902, + "step": 1436 + }, + { + "epoch": 0.5305193206989909, + "grad_norm": 0.8957372903823853, + "learning_rate": 4.9159021193505806e-06, + "loss": 0.7313, + "step": 1437 + }, + { + "epoch": 0.530888506030027, + "grad_norm": 0.9063811898231506, + "learning_rate": 4.915776982859282e-06, + "loss": 0.7097, + "step": 1438 + }, + { + "epoch": 0.5312576913610633, + "grad_norm": 0.8811819553375244, + "learning_rate": 4.9156517549315875e-06, + "loss": 0.7085, + "step": 1439 + }, + { + "epoch": 0.5316268766920994, + "grad_norm": 0.9121610522270203, + "learning_rate": 4.915526435572235e-06, + "loss": 0.753, + "step": 1440 + }, + { + "epoch": 0.5319960620231357, + "grad_norm": 0.8891005516052246, + "learning_rate": 4.915401024785971e-06, + "loss": 0.7245, + "step": 1441 + }, + { + "epoch": 0.5323652473541718, + "grad_norm": 0.9231831431388855, + "learning_rate": 4.915275522577539e-06, + "loss": 0.7305, + "step": 1442 + }, + { + "epoch": 0.5327344326852079, + "grad_norm": 0.9318893551826477, + "learning_rate": 4.915149928951693e-06, + "loss": 0.7543, + "step": 1443 + }, + { + "epoch": 0.5331036180162442, + "grad_norm": 0.8818265199661255, + "learning_rate": 4.915024243913182e-06, + "loss": 0.6775, + "step": 1444 + }, + { + "epoch": 0.5334728033472803, + "grad_norm": 0.8918500542640686, + "learning_rate": 4.9148984674667675e-06, + "loss": 0.7264, + "step": 1445 + }, + { + "epoch": 0.5338419886783166, + "grad_norm": 0.90740966796875, + "learning_rate": 4.914772599617207e-06, + "loss": 0.6985, + "step": 1446 + }, + { + "epoch": 0.5342111740093527, + "grad_norm": 0.8859344720840454, + "learning_rate": 4.914646640369266e-06, + "loss": 0.7361, + "step": 1447 + }, + { + "epoch": 0.5345803593403888, + "grad_norm": 0.9221896529197693, + "learning_rate": 4.914520589727712e-06, + "loss": 0.7234, + "step": 1448 + }, + { + "epoch": 0.5349495446714251, + "grad_norm": 0.925670862197876, + "learning_rate": 4.9143944476973146e-06, + "loss": 0.7197, + "step": 1449 + }, + { + "epoch": 0.5353187300024612, + "grad_norm": 0.9094201326370239, + "learning_rate": 4.91426821428285e-06, + "loss": 0.7479, + "step": 1450 + }, + { + "epoch": 0.5356879153334975, + "grad_norm": 0.8817557096481323, + "learning_rate": 4.914141889489095e-06, + "loss": 0.757, + "step": 1451 + }, + { + "epoch": 0.5360571006645336, + "grad_norm": 0.8657225966453552, + "learning_rate": 4.914015473320833e-06, + "loss": 0.7251, + "step": 1452 + }, + { + "epoch": 0.5364262859955697, + "grad_norm": 0.8747798204421997, + "learning_rate": 4.913888965782846e-06, + "loss": 0.7471, + "step": 1453 + }, + { + "epoch": 0.536795471326606, + "grad_norm": 0.898120105266571, + "learning_rate": 4.913762366879924e-06, + "loss": 0.7839, + "step": 1454 + }, + { + "epoch": 0.5371646566576421, + "grad_norm": 0.9788367748260498, + "learning_rate": 4.913635676616858e-06, + "loss": 0.7449, + "step": 1455 + }, + { + "epoch": 0.5375338419886783, + "grad_norm": 0.8859315514564514, + "learning_rate": 4.9135088949984425e-06, + "loss": 0.7056, + "step": 1456 + }, + { + "epoch": 0.5379030273197145, + "grad_norm": 0.8957391977310181, + "learning_rate": 4.913382022029478e-06, + "loss": 0.7059, + "step": 1457 + }, + { + "epoch": 0.5382722126507506, + "grad_norm": 0.8686297535896301, + "learning_rate": 4.913255057714765e-06, + "loss": 0.7012, + "step": 1458 + }, + { + "epoch": 0.5386413979817869, + "grad_norm": 0.8956363797187805, + "learning_rate": 4.913128002059111e-06, + "loss": 0.728, + "step": 1459 + }, + { + "epoch": 0.539010583312823, + "grad_norm": 0.8763745427131653, + "learning_rate": 4.913000855067323e-06, + "loss": 0.7409, + "step": 1460 + }, + { + "epoch": 0.5393797686438592, + "grad_norm": 0.9013528823852539, + "learning_rate": 4.912873616744213e-06, + "loss": 0.7157, + "step": 1461 + }, + { + "epoch": 0.5397489539748954, + "grad_norm": 0.9581606984138489, + "learning_rate": 4.9127462870945995e-06, + "loss": 0.719, + "step": 1462 + }, + { + "epoch": 0.5401181393059316, + "grad_norm": 0.8720983266830444, + "learning_rate": 4.912618866123301e-06, + "loss": 0.6927, + "step": 1463 + }, + { + "epoch": 0.5404873246369678, + "grad_norm": 0.8839109539985657, + "learning_rate": 4.912491353835138e-06, + "loss": 0.7265, + "step": 1464 + }, + { + "epoch": 0.5408565099680039, + "grad_norm": 0.9057873487472534, + "learning_rate": 4.91236375023494e-06, + "loss": 0.7321, + "step": 1465 + }, + { + "epoch": 0.5412256952990401, + "grad_norm": 0.8765888214111328, + "learning_rate": 4.912236055327535e-06, + "loss": 0.7096, + "step": 1466 + }, + { + "epoch": 0.5415948806300763, + "grad_norm": 0.9088855981826782, + "learning_rate": 4.912108269117757e-06, + "loss": 0.7158, + "step": 1467 + }, + { + "epoch": 0.5419640659611125, + "grad_norm": 0.9135664105415344, + "learning_rate": 4.911980391610442e-06, + "loss": 0.7148, + "step": 1468 + }, + { + "epoch": 0.5423332512921487, + "grad_norm": 0.8987833857536316, + "learning_rate": 4.91185242281043e-06, + "loss": 0.6955, + "step": 1469 + }, + { + "epoch": 0.5427024366231848, + "grad_norm": 0.890201985836029, + "learning_rate": 4.911724362722566e-06, + "loss": 0.7131, + "step": 1470 + }, + { + "epoch": 0.543071621954221, + "grad_norm": 0.9337875247001648, + "learning_rate": 4.911596211351695e-06, + "loss": 0.7518, + "step": 1471 + }, + { + "epoch": 0.5434408072852572, + "grad_norm": 0.9150261282920837, + "learning_rate": 4.911467968702669e-06, + "loss": 0.7283, + "step": 1472 + }, + { + "epoch": 0.5438099926162934, + "grad_norm": 0.9131171107292175, + "learning_rate": 4.911339634780341e-06, + "loss": 0.7292, + "step": 1473 + }, + { + "epoch": 0.5441791779473295, + "grad_norm": 0.9012570977210999, + "learning_rate": 4.91121120958957e-06, + "loss": 0.7185, + "step": 1474 + }, + { + "epoch": 0.5445483632783658, + "grad_norm": 0.9188559651374817, + "learning_rate": 4.9110826931352145e-06, + "loss": 0.7277, + "step": 1475 + }, + { + "epoch": 0.5449175486094019, + "grad_norm": 0.9295513033866882, + "learning_rate": 4.91095408542214e-06, + "loss": 0.7768, + "step": 1476 + }, + { + "epoch": 0.5452867339404381, + "grad_norm": 0.895024836063385, + "learning_rate": 4.910825386455215e-06, + "loss": 0.7565, + "step": 1477 + }, + { + "epoch": 0.5456559192714743, + "grad_norm": 0.8578177094459534, + "learning_rate": 4.91069659623931e-06, + "loss": 0.6794, + "step": 1478 + }, + { + "epoch": 0.5460251046025104, + "grad_norm": 0.8829843401908875, + "learning_rate": 4.9105677147792996e-06, + "loss": 0.6603, + "step": 1479 + }, + { + "epoch": 0.5463942899335467, + "grad_norm": 0.8859395980834961, + "learning_rate": 4.910438742080061e-06, + "loss": 0.7196, + "step": 1480 + }, + { + "epoch": 0.5467634752645828, + "grad_norm": 0.8851184844970703, + "learning_rate": 4.910309678146478e-06, + "loss": 0.7091, + "step": 1481 + }, + { + "epoch": 0.547132660595619, + "grad_norm": 0.8768782615661621, + "learning_rate": 4.910180522983434e-06, + "loss": 0.7177, + "step": 1482 + }, + { + "epoch": 0.5475018459266552, + "grad_norm": 0.8919864892959595, + "learning_rate": 4.910051276595818e-06, + "loss": 0.7121, + "step": 1483 + }, + { + "epoch": 0.5478710312576913, + "grad_norm": 0.9079790115356445, + "learning_rate": 4.909921938988521e-06, + "loss": 0.7072, + "step": 1484 + }, + { + "epoch": 0.5482402165887276, + "grad_norm": 0.9404414892196655, + "learning_rate": 4.90979251016644e-06, + "loss": 0.7396, + "step": 1485 + }, + { + "epoch": 0.5486094019197637, + "grad_norm": 0.8725743293762207, + "learning_rate": 4.909662990134473e-06, + "loss": 0.6942, + "step": 1486 + }, + { + "epoch": 0.5489785872508, + "grad_norm": 0.90342777967453, + "learning_rate": 4.909533378897522e-06, + "loss": 0.7341, + "step": 1487 + }, + { + "epoch": 0.5493477725818361, + "grad_norm": 0.9160227179527283, + "learning_rate": 4.909403676460494e-06, + "loss": 0.7173, + "step": 1488 + }, + { + "epoch": 0.5497169579128722, + "grad_norm": 0.8947250843048096, + "learning_rate": 4.909273882828296e-06, + "loss": 0.7126, + "step": 1489 + }, + { + "epoch": 0.5500861432439085, + "grad_norm": 0.889103889465332, + "learning_rate": 4.909143998005842e-06, + "loss": 0.6949, + "step": 1490 + }, + { + "epoch": 0.5504553285749446, + "grad_norm": 0.8343292474746704, + "learning_rate": 4.909014021998049e-06, + "loss": 0.698, + "step": 1491 + }, + { + "epoch": 0.5508245139059808, + "grad_norm": 0.8784751892089844, + "learning_rate": 4.908883954809834e-06, + "loss": 0.6957, + "step": 1492 + }, + { + "epoch": 0.551193699237017, + "grad_norm": 0.9363612532615662, + "learning_rate": 4.908753796446123e-06, + "loss": 0.7385, + "step": 1493 + }, + { + "epoch": 0.5515628845680531, + "grad_norm": 0.899426281452179, + "learning_rate": 4.908623546911841e-06, + "loss": 0.7354, + "step": 1494 + }, + { + "epoch": 0.5519320698990894, + "grad_norm": 1.1256046295166016, + "learning_rate": 4.908493206211917e-06, + "loss": 0.7554, + "step": 1495 + }, + { + "epoch": 0.5523012552301255, + "grad_norm": 0.8998042941093445, + "learning_rate": 4.908362774351286e-06, + "loss": 0.718, + "step": 1496 + }, + { + "epoch": 0.5526704405611617, + "grad_norm": 0.9059179425239563, + "learning_rate": 4.908232251334884e-06, + "loss": 0.7193, + "step": 1497 + }, + { + "epoch": 0.5530396258921979, + "grad_norm": 0.8895880579948425, + "learning_rate": 4.90810163716765e-06, + "loss": 0.7431, + "step": 1498 + }, + { + "epoch": 0.553408811223234, + "grad_norm": 0.8827221393585205, + "learning_rate": 4.907970931854531e-06, + "loss": 0.7553, + "step": 1499 + }, + { + "epoch": 0.5537779965542703, + "grad_norm": 0.8968391418457031, + "learning_rate": 4.9078401354004715e-06, + "loss": 0.7487, + "step": 1500 + }, + { + "epoch": 0.5541471818853064, + "grad_norm": 0.8841264247894287, + "learning_rate": 4.907709247810422e-06, + "loss": 0.7482, + "step": 1501 + }, + { + "epoch": 0.5545163672163426, + "grad_norm": 0.8847429156303406, + "learning_rate": 4.907578269089338e-06, + "loss": 0.7099, + "step": 1502 + }, + { + "epoch": 0.5548855525473788, + "grad_norm": 0.892648458480835, + "learning_rate": 4.9074471992421765e-06, + "loss": 0.7092, + "step": 1503 + }, + { + "epoch": 0.555254737878415, + "grad_norm": 0.9089244604110718, + "learning_rate": 4.907316038273899e-06, + "loss": 0.7395, + "step": 1504 + }, + { + "epoch": 0.5556239232094512, + "grad_norm": 0.9008041620254517, + "learning_rate": 4.9071847861894684e-06, + "loss": 0.7522, + "step": 1505 + }, + { + "epoch": 0.5559931085404873, + "grad_norm": 1.0230878591537476, + "learning_rate": 4.907053442993853e-06, + "loss": 0.7571, + "step": 1506 + }, + { + "epoch": 0.5563622938715235, + "grad_norm": 0.9059250950813293, + "learning_rate": 4.906922008692025e-06, + "loss": 0.7478, + "step": 1507 + }, + { + "epoch": 0.5567314792025597, + "grad_norm": 0.8838487863540649, + "learning_rate": 4.906790483288958e-06, + "loss": 0.7608, + "step": 1508 + }, + { + "epoch": 0.5571006645335959, + "grad_norm": 0.879643440246582, + "learning_rate": 4.906658866789632e-06, + "loss": 0.7373, + "step": 1509 + }, + { + "epoch": 0.557469849864632, + "grad_norm": 0.9180140495300293, + "learning_rate": 4.906527159199027e-06, + "loss": 0.7187, + "step": 1510 + }, + { + "epoch": 0.5578390351956682, + "grad_norm": 0.8671844005584717, + "learning_rate": 4.906395360522128e-06, + "loss": 0.6626, + "step": 1511 + }, + { + "epoch": 0.5582082205267044, + "grad_norm": 0.9056374430656433, + "learning_rate": 4.9062634707639235e-06, + "loss": 0.7523, + "step": 1512 + }, + { + "epoch": 0.5585774058577406, + "grad_norm": 0.8730549812316895, + "learning_rate": 4.9061314899294074e-06, + "loss": 0.7356, + "step": 1513 + }, + { + "epoch": 0.5589465911887768, + "grad_norm": 0.8778785467147827, + "learning_rate": 4.905999418023574e-06, + "loss": 0.7071, + "step": 1514 + }, + { + "epoch": 0.5593157765198129, + "grad_norm": 0.8926696181297302, + "learning_rate": 4.905867255051421e-06, + "loss": 0.6818, + "step": 1515 + }, + { + "epoch": 0.5596849618508491, + "grad_norm": 0.8964526057243347, + "learning_rate": 4.905735001017952e-06, + "loss": 0.7376, + "step": 1516 + }, + { + "epoch": 0.5600541471818853, + "grad_norm": 0.8878997564315796, + "learning_rate": 4.905602655928172e-06, + "loss": 0.702, + "step": 1517 + }, + { + "epoch": 0.5604233325129215, + "grad_norm": 0.9055455327033997, + "learning_rate": 4.9054702197870905e-06, + "loss": 0.7591, + "step": 1518 + }, + { + "epoch": 0.5607925178439577, + "grad_norm": 0.8939942717552185, + "learning_rate": 4.9053376925997216e-06, + "loss": 0.7195, + "step": 1519 + }, + { + "epoch": 0.5611617031749938, + "grad_norm": 0.8912205696105957, + "learning_rate": 4.90520507437108e-06, + "loss": 0.7483, + "step": 1520 + }, + { + "epoch": 0.5615308885060301, + "grad_norm": 0.9732519388198853, + "learning_rate": 4.905072365106184e-06, + "loss": 0.7273, + "step": 1521 + }, + { + "epoch": 0.5619000738370662, + "grad_norm": 0.8627989888191223, + "learning_rate": 4.904939564810059e-06, + "loss": 0.7527, + "step": 1522 + }, + { + "epoch": 0.5622692591681024, + "grad_norm": 0.8785387277603149, + "learning_rate": 4.904806673487731e-06, + "loss": 0.7135, + "step": 1523 + }, + { + "epoch": 0.5626384444991386, + "grad_norm": 0.8822858929634094, + "learning_rate": 4.904673691144229e-06, + "loss": 0.725, + "step": 1524 + }, + { + "epoch": 0.5630076298301747, + "grad_norm": 0.8640886545181274, + "learning_rate": 4.904540617784587e-06, + "loss": 0.6923, + "step": 1525 + }, + { + "epoch": 0.563376815161211, + "grad_norm": 0.9395278096199036, + "learning_rate": 4.904407453413841e-06, + "loss": 0.7389, + "step": 1526 + }, + { + "epoch": 0.5637460004922471, + "grad_norm": 0.9424323439598083, + "learning_rate": 4.904274198037031e-06, + "loss": 0.7392, + "step": 1527 + }, + { + "epoch": 0.5641151858232832, + "grad_norm": 0.8736268281936646, + "learning_rate": 4.904140851659203e-06, + "loss": 0.689, + "step": 1528 + }, + { + "epoch": 0.5644843711543195, + "grad_norm": 0.9197478294372559, + "learning_rate": 4.904007414285401e-06, + "loss": 0.7496, + "step": 1529 + }, + { + "epoch": 0.5648535564853556, + "grad_norm": 0.8863821029663086, + "learning_rate": 4.903873885920678e-06, + "loss": 0.7162, + "step": 1530 + }, + { + "epoch": 0.5652227418163919, + "grad_norm": 0.9655389189720154, + "learning_rate": 4.903740266570087e-06, + "loss": 0.735, + "step": 1531 + }, + { + "epoch": 0.565591927147428, + "grad_norm": 0.8864624500274658, + "learning_rate": 4.903606556238686e-06, + "loss": 0.7066, + "step": 1532 + }, + { + "epoch": 0.5659611124784641, + "grad_norm": 0.8712696433067322, + "learning_rate": 4.9034727549315344e-06, + "loss": 0.7257, + "step": 1533 + }, + { + "epoch": 0.5663302978095004, + "grad_norm": 0.902729868888855, + "learning_rate": 4.903338862653698e-06, + "loss": 0.744, + "step": 1534 + }, + { + "epoch": 0.5666994831405365, + "grad_norm": 0.9051229953765869, + "learning_rate": 4.903204879410245e-06, + "loss": 0.6945, + "step": 1535 + }, + { + "epoch": 0.5670686684715728, + "grad_norm": 0.9622679352760315, + "learning_rate": 4.9030708052062445e-06, + "loss": 0.7472, + "step": 1536 + }, + { + "epoch": 0.5674378538026089, + "grad_norm": 0.9077664017677307, + "learning_rate": 4.902936640046772e-06, + "loss": 0.719, + "step": 1537 + }, + { + "epoch": 0.567807039133645, + "grad_norm": 0.8758202195167542, + "learning_rate": 4.902802383936908e-06, + "loss": 0.7191, + "step": 1538 + }, + { + "epoch": 0.5681762244646813, + "grad_norm": 0.8584937453269958, + "learning_rate": 4.902668036881731e-06, + "loss": 0.723, + "step": 1539 + }, + { + "epoch": 0.5685454097957174, + "grad_norm": 0.8474022746086121, + "learning_rate": 4.902533598886327e-06, + "loss": 0.6828, + "step": 1540 + }, + { + "epoch": 0.5689145951267536, + "grad_norm": 0.8970612287521362, + "learning_rate": 4.902399069955784e-06, + "loss": 0.7298, + "step": 1541 + }, + { + "epoch": 0.5692837804577898, + "grad_norm": 0.8726117014884949, + "learning_rate": 4.9022644500951956e-06, + "loss": 0.7043, + "step": 1542 + }, + { + "epoch": 0.569652965788826, + "grad_norm": 0.8593499660491943, + "learning_rate": 4.902129739309655e-06, + "loss": 0.7272, + "step": 1543 + }, + { + "epoch": 0.5700221511198622, + "grad_norm": 0.9041122794151306, + "learning_rate": 4.901994937604263e-06, + "loss": 0.7057, + "step": 1544 + }, + { + "epoch": 0.5703913364508983, + "grad_norm": 0.9077982306480408, + "learning_rate": 4.90186004498412e-06, + "loss": 0.7139, + "step": 1545 + }, + { + "epoch": 0.5707605217819345, + "grad_norm": 0.9217522740364075, + "learning_rate": 4.9017250614543326e-06, + "loss": 0.7491, + "step": 1546 + }, + { + "epoch": 0.5711297071129707, + "grad_norm": 0.95450758934021, + "learning_rate": 4.901589987020009e-06, + "loss": 0.7225, + "step": 1547 + }, + { + "epoch": 0.5714988924440069, + "grad_norm": 0.8575482368469238, + "learning_rate": 4.9014548216862635e-06, + "loss": 0.7052, + "step": 1548 + }, + { + "epoch": 0.5718680777750431, + "grad_norm": 0.8921974897384644, + "learning_rate": 4.90131956545821e-06, + "loss": 0.7423, + "step": 1549 + }, + { + "epoch": 0.5722372631060793, + "grad_norm": 0.9220647811889648, + "learning_rate": 4.901184218340969e-06, + "loss": 0.779, + "step": 1550 + }, + { + "epoch": 0.5726064484371154, + "grad_norm": 0.9192151427268982, + "learning_rate": 4.901048780339662e-06, + "loss": 0.7373, + "step": 1551 + }, + { + "epoch": 0.5729756337681516, + "grad_norm": 0.8914799690246582, + "learning_rate": 4.900913251459418e-06, + "loss": 0.7436, + "step": 1552 + }, + { + "epoch": 0.5733448190991878, + "grad_norm": 0.9210816621780396, + "learning_rate": 4.9007776317053654e-06, + "loss": 0.7246, + "step": 1553 + }, + { + "epoch": 0.573714004430224, + "grad_norm": 0.8826539516448975, + "learning_rate": 4.900641921082636e-06, + "loss": 0.6702, + "step": 1554 + }, + { + "epoch": 0.5740831897612602, + "grad_norm": 0.9123632907867432, + "learning_rate": 4.9005061195963686e-06, + "loss": 0.7505, + "step": 1555 + }, + { + "epoch": 0.5744523750922963, + "grad_norm": 0.8475764989852905, + "learning_rate": 4.900370227251702e-06, + "loss": 0.7349, + "step": 1556 + }, + { + "epoch": 0.5748215604233325, + "grad_norm": 0.9372847080230713, + "learning_rate": 4.900234244053778e-06, + "loss": 0.6943, + "step": 1557 + }, + { + "epoch": 0.5751907457543687, + "grad_norm": 0.8899771571159363, + "learning_rate": 4.900098170007748e-06, + "loss": 0.6975, + "step": 1558 + }, + { + "epoch": 0.5755599310854048, + "grad_norm": 0.9169413447380066, + "learning_rate": 4.899962005118759e-06, + "loss": 0.7258, + "step": 1559 + }, + { + "epoch": 0.5759291164164411, + "grad_norm": 0.9141312837600708, + "learning_rate": 4.899825749391965e-06, + "loss": 0.7496, + "step": 1560 + }, + { + "epoch": 0.5762983017474772, + "grad_norm": 0.8956618905067444, + "learning_rate": 4.8996894028325234e-06, + "loss": 0.7375, + "step": 1561 + }, + { + "epoch": 0.5766674870785135, + "grad_norm": 0.8596277236938477, + "learning_rate": 4.899552965445596e-06, + "loss": 0.7099, + "step": 1562 + }, + { + "epoch": 0.5770366724095496, + "grad_norm": 0.887212336063385, + "learning_rate": 4.899416437236346e-06, + "loss": 0.7221, + "step": 1563 + }, + { + "epoch": 0.5774058577405857, + "grad_norm": 0.9066047668457031, + "learning_rate": 4.8992798182099415e-06, + "loss": 0.7132, + "step": 1564 + }, + { + "epoch": 0.577775043071622, + "grad_norm": 0.8738695979118347, + "learning_rate": 4.899143108371552e-06, + "loss": 0.7055, + "step": 1565 + }, + { + "epoch": 0.5781442284026581, + "grad_norm": 0.87455153465271, + "learning_rate": 4.899006307726354e-06, + "loss": 0.7169, + "step": 1566 + }, + { + "epoch": 0.5785134137336944, + "grad_norm": 0.8736885786056519, + "learning_rate": 4.898869416279524e-06, + "loss": 0.7305, + "step": 1567 + }, + { + "epoch": 0.5788825990647305, + "grad_norm": 0.8731737732887268, + "learning_rate": 4.8987324340362445e-06, + "loss": 0.771, + "step": 1568 + }, + { + "epoch": 0.5792517843957666, + "grad_norm": 0.8779594898223877, + "learning_rate": 4.898595361001698e-06, + "loss": 0.7234, + "step": 1569 + }, + { + "epoch": 0.5796209697268029, + "grad_norm": 0.8702481985092163, + "learning_rate": 4.898458197181075e-06, + "loss": 0.728, + "step": 1570 + }, + { + "epoch": 0.579990155057839, + "grad_norm": 0.9149512052536011, + "learning_rate": 4.898320942579566e-06, + "loss": 0.7651, + "step": 1571 + }, + { + "epoch": 0.5803593403888753, + "grad_norm": 0.8694207072257996, + "learning_rate": 4.898183597202366e-06, + "loss": 0.7109, + "step": 1572 + }, + { + "epoch": 0.5807285257199114, + "grad_norm": 0.9014127850532532, + "learning_rate": 4.898046161054674e-06, + "loss": 0.7631, + "step": 1573 + }, + { + "epoch": 0.5810977110509475, + "grad_norm": 0.889441967010498, + "learning_rate": 4.897908634141692e-06, + "loss": 0.6963, + "step": 1574 + }, + { + "epoch": 0.5814668963819838, + "grad_norm": 0.8994700908660889, + "learning_rate": 4.897771016468624e-06, + "loss": 0.718, + "step": 1575 + }, + { + "epoch": 0.5818360817130199, + "grad_norm": 0.877490758895874, + "learning_rate": 4.897633308040681e-06, + "loss": 0.7188, + "step": 1576 + }, + { + "epoch": 0.5822052670440561, + "grad_norm": 0.8843386769294739, + "learning_rate": 4.8974955088630736e-06, + "loss": 0.7314, + "step": 1577 + }, + { + "epoch": 0.5825744523750923, + "grad_norm": 0.8549126386642456, + "learning_rate": 4.897357618941017e-06, + "loss": 0.7254, + "step": 1578 + }, + { + "epoch": 0.5829436377061284, + "grad_norm": 0.8546504378318787, + "learning_rate": 4.897219638279732e-06, + "loss": 0.737, + "step": 1579 + }, + { + "epoch": 0.5833128230371647, + "grad_norm": 0.8920966386795044, + "learning_rate": 4.89708156688444e-06, + "loss": 0.7096, + "step": 1580 + }, + { + "epoch": 0.5836820083682008, + "grad_norm": 0.8906827569007874, + "learning_rate": 4.896943404760368e-06, + "loss": 0.7314, + "step": 1581 + }, + { + "epoch": 0.584051193699237, + "grad_norm": 0.8761510252952576, + "learning_rate": 4.896805151912743e-06, + "loss": 0.6968, + "step": 1582 + }, + { + "epoch": 0.5844203790302732, + "grad_norm": 0.9379798173904419, + "learning_rate": 4.896666808346801e-06, + "loss": 0.7284, + "step": 1583 + }, + { + "epoch": 0.5847895643613094, + "grad_norm": 0.8771979212760925, + "learning_rate": 4.8965283740677765e-06, + "loss": 0.7416, + "step": 1584 + }, + { + "epoch": 0.5851587496923456, + "grad_norm": 0.8932775259017944, + "learning_rate": 4.896389849080908e-06, + "loss": 0.7203, + "step": 1585 + }, + { + "epoch": 0.5855279350233817, + "grad_norm": 0.9140869975090027, + "learning_rate": 4.8962512333914415e-06, + "loss": 0.7585, + "step": 1586 + }, + { + "epoch": 0.5858971203544179, + "grad_norm": 0.9230924844741821, + "learning_rate": 4.896112527004621e-06, + "loss": 0.7134, + "step": 1587 + }, + { + "epoch": 0.5862663056854541, + "grad_norm": 0.9974596500396729, + "learning_rate": 4.895973729925698e-06, + "loss": 0.6977, + "step": 1588 + }, + { + "epoch": 0.5866354910164903, + "grad_norm": 0.8764031529426575, + "learning_rate": 4.8958348421599255e-06, + "loss": 0.7449, + "step": 1589 + }, + { + "epoch": 0.5870046763475265, + "grad_norm": 0.9063859581947327, + "learning_rate": 4.895695863712561e-06, + "loss": 0.7296, + "step": 1590 + }, + { + "epoch": 0.5873738616785626, + "grad_norm": 0.9148220419883728, + "learning_rate": 4.895556794588864e-06, + "loss": 0.7396, + "step": 1591 + }, + { + "epoch": 0.5877430470095988, + "grad_norm": 0.8891001343727112, + "learning_rate": 4.895417634794098e-06, + "loss": 0.6993, + "step": 1592 + }, + { + "epoch": 0.588112232340635, + "grad_norm": 0.8979329466819763, + "learning_rate": 4.89527838433353e-06, + "loss": 0.7306, + "step": 1593 + }, + { + "epoch": 0.5884814176716712, + "grad_norm": 0.9050261974334717, + "learning_rate": 4.895139043212432e-06, + "loss": 0.7339, + "step": 1594 + }, + { + "epoch": 0.5888506030027073, + "grad_norm": 0.8708974719047546, + "learning_rate": 4.894999611436076e-06, + "loss": 0.7163, + "step": 1595 + }, + { + "epoch": 0.5892197883337436, + "grad_norm": 0.9276278018951416, + "learning_rate": 4.894860089009742e-06, + "loss": 0.728, + "step": 1596 + }, + { + "epoch": 0.5895889736647797, + "grad_norm": 1.0377492904663086, + "learning_rate": 4.894720475938709e-06, + "loss": 0.719, + "step": 1597 + }, + { + "epoch": 0.5899581589958159, + "grad_norm": 0.8987076282501221, + "learning_rate": 4.894580772228261e-06, + "loss": 0.7321, + "step": 1598 + }, + { + "epoch": 0.5903273443268521, + "grad_norm": 0.9341242909431458, + "learning_rate": 4.8944409778836874e-06, + "loss": 0.7379, + "step": 1599 + }, + { + "epoch": 0.5906965296578882, + "grad_norm": 0.9609735012054443, + "learning_rate": 4.894301092910278e-06, + "loss": 0.7361, + "step": 1600 + }, + { + "epoch": 0.5910657149889245, + "grad_norm": 0.8665148615837097, + "learning_rate": 4.8941611173133285e-06, + "loss": 0.7229, + "step": 1601 + }, + { + "epoch": 0.5914349003199606, + "grad_norm": 0.9139685034751892, + "learning_rate": 4.894021051098136e-06, + "loss": 0.7416, + "step": 1602 + }, + { + "epoch": 0.5918040856509968, + "grad_norm": 0.9231698513031006, + "learning_rate": 4.893880894270002e-06, + "loss": 0.7051, + "step": 1603 + }, + { + "epoch": 0.592173270982033, + "grad_norm": 0.8691010475158691, + "learning_rate": 4.893740646834232e-06, + "loss": 0.699, + "step": 1604 + }, + { + "epoch": 0.5925424563130691, + "grad_norm": 0.9221158623695374, + "learning_rate": 4.893600308796134e-06, + "loss": 0.745, + "step": 1605 + }, + { + "epoch": 0.5929116416441054, + "grad_norm": 0.86871337890625, + "learning_rate": 4.893459880161019e-06, + "loss": 0.7578, + "step": 1606 + }, + { + "epoch": 0.5932808269751415, + "grad_norm": 0.8679521679878235, + "learning_rate": 4.893319360934203e-06, + "loss": 0.728, + "step": 1607 + }, + { + "epoch": 0.5936500123061778, + "grad_norm": 0.9038932919502258, + "learning_rate": 4.893178751121006e-06, + "loss": 0.7258, + "step": 1608 + }, + { + "epoch": 0.5940191976372139, + "grad_norm": 0.890328049659729, + "learning_rate": 4.893038050726747e-06, + "loss": 0.7208, + "step": 1609 + }, + { + "epoch": 0.59438838296825, + "grad_norm": 0.9175987243652344, + "learning_rate": 4.892897259756753e-06, + "loss": 0.7468, + "step": 1610 + }, + { + "epoch": 0.5947575682992863, + "grad_norm": 0.9225278496742249, + "learning_rate": 4.892756378216354e-06, + "loss": 0.7379, + "step": 1611 + }, + { + "epoch": 0.5951267536303224, + "grad_norm": 0.8708120584487915, + "learning_rate": 4.8926154061108814e-06, + "loss": 0.7159, + "step": 1612 + }, + { + "epoch": 0.5954959389613586, + "grad_norm": 0.8939933776855469, + "learning_rate": 4.89247434344567e-06, + "loss": 0.7274, + "step": 1613 + }, + { + "epoch": 0.5958651242923948, + "grad_norm": 0.9112115502357483, + "learning_rate": 4.8923331902260604e-06, + "loss": 0.74, + "step": 1614 + }, + { + "epoch": 0.5962343096234309, + "grad_norm": 0.9555295705795288, + "learning_rate": 4.892191946457394e-06, + "loss": 0.7531, + "step": 1615 + }, + { + "epoch": 0.5966034949544672, + "grad_norm": 0.9047622680664062, + "learning_rate": 4.892050612145017e-06, + "loss": 0.763, + "step": 1616 + }, + { + "epoch": 0.5969726802855033, + "grad_norm": 0.9045296907424927, + "learning_rate": 4.8919091872942805e-06, + "loss": 0.6807, + "step": 1617 + }, + { + "epoch": 0.5973418656165395, + "grad_norm": 0.9008351564407349, + "learning_rate": 4.8917676719105355e-06, + "loss": 0.7378, + "step": 1618 + }, + { + "epoch": 0.5977110509475757, + "grad_norm": 0.8514592051506042, + "learning_rate": 4.891626065999139e-06, + "loss": 0.7158, + "step": 1619 + }, + { + "epoch": 0.5980802362786118, + "grad_norm": 0.9227151274681091, + "learning_rate": 4.8914843695654504e-06, + "loss": 0.7079, + "step": 1620 + }, + { + "epoch": 0.5984494216096481, + "grad_norm": 0.9016704559326172, + "learning_rate": 4.891342582614834e-06, + "loss": 0.7307, + "step": 1621 + }, + { + "epoch": 0.5988186069406842, + "grad_norm": 0.8876495957374573, + "learning_rate": 4.891200705152654e-06, + "loss": 0.7375, + "step": 1622 + }, + { + "epoch": 0.5991877922717204, + "grad_norm": 0.8613106608390808, + "learning_rate": 4.891058737184284e-06, + "loss": 0.7055, + "step": 1623 + }, + { + "epoch": 0.5995569776027566, + "grad_norm": 0.9127830862998962, + "learning_rate": 4.890916678715094e-06, + "loss": 0.7439, + "step": 1624 + }, + { + "epoch": 0.5999261629337928, + "grad_norm": 0.8781334161758423, + "learning_rate": 4.890774529750463e-06, + "loss": 0.7034, + "step": 1625 + }, + { + "epoch": 0.600295348264829, + "grad_norm": 0.8634480834007263, + "learning_rate": 4.890632290295771e-06, + "loss": 0.7129, + "step": 1626 + }, + { + "epoch": 0.6006645335958651, + "grad_norm": 0.8753401637077332, + "learning_rate": 4.8904899603564e-06, + "loss": 0.71, + "step": 1627 + }, + { + "epoch": 0.6010337189269013, + "grad_norm": 0.9049730896949768, + "learning_rate": 4.890347539937739e-06, + "loss": 0.7161, + "step": 1628 + }, + { + "epoch": 0.6014029042579375, + "grad_norm": 0.8901035785675049, + "learning_rate": 4.890205029045179e-06, + "loss": 0.7126, + "step": 1629 + }, + { + "epoch": 0.6017720895889737, + "grad_norm": 0.9000959396362305, + "learning_rate": 4.890062427684111e-06, + "loss": 0.6685, + "step": 1630 + }, + { + "epoch": 0.6021412749200098, + "grad_norm": 0.8935882449150085, + "learning_rate": 4.889919735859936e-06, + "loss": 0.693, + "step": 1631 + }, + { + "epoch": 0.602510460251046, + "grad_norm": 0.8670996427536011, + "learning_rate": 4.8897769535780525e-06, + "loss": 0.725, + "step": 1632 + }, + { + "epoch": 0.6028796455820822, + "grad_norm": 0.8828466534614563, + "learning_rate": 4.889634080843866e-06, + "loss": 0.7506, + "step": 1633 + }, + { + "epoch": 0.6032488309131184, + "grad_norm": 0.8849684596061707, + "learning_rate": 4.889491117662783e-06, + "loss": 0.7249, + "step": 1634 + }, + { + "epoch": 0.6036180162441546, + "grad_norm": 0.8891832828521729, + "learning_rate": 4.889348064040217e-06, + "loss": 0.7206, + "step": 1635 + }, + { + "epoch": 0.6039872015751907, + "grad_norm": 0.8497231602668762, + "learning_rate": 4.889204919981579e-06, + "loss": 0.6812, + "step": 1636 + }, + { + "epoch": 0.604356386906227, + "grad_norm": 0.872078537940979, + "learning_rate": 4.88906168549229e-06, + "loss": 0.7234, + "step": 1637 + }, + { + "epoch": 0.6047255722372631, + "grad_norm": 0.8662791848182678, + "learning_rate": 4.88891836057777e-06, + "loss": 0.7222, + "step": 1638 + }, + { + "epoch": 0.6050947575682993, + "grad_norm": 0.8954866528511047, + "learning_rate": 4.888774945243444e-06, + "loss": 0.7175, + "step": 1639 + }, + { + "epoch": 0.6054639428993355, + "grad_norm": 0.9128788709640503, + "learning_rate": 4.8886314394947396e-06, + "loss": 0.7185, + "step": 1640 + }, + { + "epoch": 0.6058331282303716, + "grad_norm": 0.9027896523475647, + "learning_rate": 4.888487843337089e-06, + "loss": 0.7559, + "step": 1641 + }, + { + "epoch": 0.6062023135614079, + "grad_norm": 0.868640124797821, + "learning_rate": 4.888344156775928e-06, + "loss": 0.7144, + "step": 1642 + }, + { + "epoch": 0.606571498892444, + "grad_norm": 0.9141109585762024, + "learning_rate": 4.888200379816695e-06, + "loss": 0.7381, + "step": 1643 + }, + { + "epoch": 0.6069406842234802, + "grad_norm": 0.875461757183075, + "learning_rate": 4.88805651246483e-06, + "loss": 0.7539, + "step": 1644 + }, + { + "epoch": 0.6073098695545164, + "grad_norm": 0.9165515303611755, + "learning_rate": 4.887912554725781e-06, + "loss": 0.7278, + "step": 1645 + }, + { + "epoch": 0.6076790548855525, + "grad_norm": 0.8719481825828552, + "learning_rate": 4.887768506604995e-06, + "loss": 0.7323, + "step": 1646 + }, + { + "epoch": 0.6080482402165888, + "grad_norm": 0.8756598830223083, + "learning_rate": 4.887624368107924e-06, + "loss": 0.7104, + "step": 1647 + }, + { + "epoch": 0.6084174255476249, + "grad_norm": 0.9028515219688416, + "learning_rate": 4.887480139240025e-06, + "loss": 0.731, + "step": 1648 + }, + { + "epoch": 0.608786610878661, + "grad_norm": 0.8773224353790283, + "learning_rate": 4.887335820006756e-06, + "loss": 0.7058, + "step": 1649 + }, + { + "epoch": 0.6091557962096973, + "grad_norm": 0.9026007652282715, + "learning_rate": 4.887191410413579e-06, + "loss": 0.6913, + "step": 1650 + }, + { + "epoch": 0.6095249815407334, + "grad_norm": 0.8898575305938721, + "learning_rate": 4.887046910465961e-06, + "loss": 0.7041, + "step": 1651 + }, + { + "epoch": 0.6098941668717697, + "grad_norm": 0.9111222624778748, + "learning_rate": 4.886902320169371e-06, + "loss": 0.7135, + "step": 1652 + }, + { + "epoch": 0.6102633522028058, + "grad_norm": 0.9058326482772827, + "learning_rate": 4.886757639529282e-06, + "loss": 0.6976, + "step": 1653 + }, + { + "epoch": 0.610632537533842, + "grad_norm": 0.8757637739181519, + "learning_rate": 4.886612868551168e-06, + "loss": 0.7507, + "step": 1654 + }, + { + "epoch": 0.6110017228648782, + "grad_norm": 0.910811722278595, + "learning_rate": 4.886468007240511e-06, + "loss": 0.757, + "step": 1655 + }, + { + "epoch": 0.6113709081959143, + "grad_norm": 0.897999107837677, + "learning_rate": 4.886323055602793e-06, + "loss": 0.7752, + "step": 1656 + }, + { + "epoch": 0.6117400935269506, + "grad_norm": 0.8987488746643066, + "learning_rate": 4.886178013643501e-06, + "loss": 0.7045, + "step": 1657 + }, + { + "epoch": 0.6121092788579867, + "grad_norm": 0.8889223337173462, + "learning_rate": 4.886032881368124e-06, + "loss": 0.6935, + "step": 1658 + }, + { + "epoch": 0.6124784641890229, + "grad_norm": 0.8837577700614929, + "learning_rate": 4.885887658782156e-06, + "loss": 0.6639, + "step": 1659 + }, + { + "epoch": 0.6128476495200591, + "grad_norm": 0.8779164552688599, + "learning_rate": 4.8857423458910925e-06, + "loss": 0.7181, + "step": 1660 + }, + { + "epoch": 0.6132168348510952, + "grad_norm": 0.9047713279724121, + "learning_rate": 4.885596942700434e-06, + "loss": 0.7417, + "step": 1661 + }, + { + "epoch": 0.6135860201821314, + "grad_norm": 0.8854183554649353, + "learning_rate": 4.885451449215685e-06, + "loss": 0.7511, + "step": 1662 + }, + { + "epoch": 0.6139552055131676, + "grad_norm": 0.9122277498245239, + "learning_rate": 4.88530586544235e-06, + "loss": 0.7472, + "step": 1663 + }, + { + "epoch": 0.6143243908442038, + "grad_norm": 0.8803077340126038, + "learning_rate": 4.885160191385942e-06, + "loss": 0.7052, + "step": 1664 + }, + { + "epoch": 0.61469357617524, + "grad_norm": 0.878976047039032, + "learning_rate": 4.885014427051973e-06, + "loss": 0.7416, + "step": 1665 + }, + { + "epoch": 0.6150627615062761, + "grad_norm": 0.8740445375442505, + "learning_rate": 4.884868572445961e-06, + "loss": 0.6892, + "step": 1666 + }, + { + "epoch": 0.6154319468373123, + "grad_norm": 0.8993122577667236, + "learning_rate": 4.884722627573426e-06, + "loss": 0.7153, + "step": 1667 + }, + { + "epoch": 0.6158011321683485, + "grad_norm": 0.8919605016708374, + "learning_rate": 4.884576592439893e-06, + "loss": 0.7189, + "step": 1668 + }, + { + "epoch": 0.6161703174993847, + "grad_norm": 0.8694654107093811, + "learning_rate": 4.884430467050887e-06, + "loss": 0.7068, + "step": 1669 + }, + { + "epoch": 0.6165395028304209, + "grad_norm": 0.8842293620109558, + "learning_rate": 4.884284251411941e-06, + "loss": 0.7464, + "step": 1670 + }, + { + "epoch": 0.616908688161457, + "grad_norm": 0.9038980007171631, + "learning_rate": 4.884137945528589e-06, + "loss": 0.7447, + "step": 1671 + }, + { + "epoch": 0.6172778734924932, + "grad_norm": 0.8944399356842041, + "learning_rate": 4.883991549406368e-06, + "loss": 0.7608, + "step": 1672 + }, + { + "epoch": 0.6176470588235294, + "grad_norm": 0.8438637256622314, + "learning_rate": 4.883845063050819e-06, + "loss": 0.7548, + "step": 1673 + }, + { + "epoch": 0.6180162441545656, + "grad_norm": 0.8841381669044495, + "learning_rate": 4.883698486467487e-06, + "loss": 0.7395, + "step": 1674 + }, + { + "epoch": 0.6183854294856018, + "grad_norm": 0.8783007860183716, + "learning_rate": 4.883551819661919e-06, + "loss": 0.7002, + "step": 1675 + }, + { + "epoch": 0.618754614816638, + "grad_norm": 0.8673411011695862, + "learning_rate": 4.883405062639668e-06, + "loss": 0.6778, + "step": 1676 + }, + { + "epoch": 0.6191238001476741, + "grad_norm": 0.9229005575180054, + "learning_rate": 4.883258215406287e-06, + "loss": 0.7444, + "step": 1677 + }, + { + "epoch": 0.6194929854787103, + "grad_norm": 0.8770948052406311, + "learning_rate": 4.883111277967334e-06, + "loss": 0.7174, + "step": 1678 + }, + { + "epoch": 0.6198621708097465, + "grad_norm": 0.8880107998847961, + "learning_rate": 4.882964250328373e-06, + "loss": 0.7353, + "step": 1679 + }, + { + "epoch": 0.6202313561407826, + "grad_norm": 0.8703299760818481, + "learning_rate": 4.882817132494966e-06, + "loss": 0.7469, + "step": 1680 + }, + { + "epoch": 0.6206005414718189, + "grad_norm": 0.8967667818069458, + "learning_rate": 4.882669924472682e-06, + "loss": 0.702, + "step": 1681 + }, + { + "epoch": 0.620969726802855, + "grad_norm": 0.9066647291183472, + "learning_rate": 4.882522626267094e-06, + "loss": 0.6851, + "step": 1682 + }, + { + "epoch": 0.6213389121338913, + "grad_norm": 0.8500033020973206, + "learning_rate": 4.882375237883777e-06, + "loss": 0.74, + "step": 1683 + }, + { + "epoch": 0.6217080974649274, + "grad_norm": 0.9138725996017456, + "learning_rate": 4.882227759328308e-06, + "loss": 0.7305, + "step": 1684 + }, + { + "epoch": 0.6220772827959635, + "grad_norm": 0.8820671439170837, + "learning_rate": 4.882080190606271e-06, + "loss": 0.6959, + "step": 1685 + }, + { + "epoch": 0.6224464681269998, + "grad_norm": 0.8790507316589355, + "learning_rate": 4.881932531723251e-06, + "loss": 0.7276, + "step": 1686 + }, + { + "epoch": 0.6228156534580359, + "grad_norm": 0.880133330821991, + "learning_rate": 4.881784782684835e-06, + "loss": 0.7213, + "step": 1687 + }, + { + "epoch": 0.6231848387890722, + "grad_norm": 0.9163568615913391, + "learning_rate": 4.881636943496618e-06, + "loss": 0.7214, + "step": 1688 + }, + { + "epoch": 0.6235540241201083, + "grad_norm": 0.8676769733428955, + "learning_rate": 4.881489014164194e-06, + "loss": 0.7168, + "step": 1689 + }, + { + "epoch": 0.6239232094511444, + "grad_norm": 0.9037620425224304, + "learning_rate": 4.881340994693162e-06, + "loss": 0.7092, + "step": 1690 + }, + { + "epoch": 0.6242923947821807, + "grad_norm": 0.8854486346244812, + "learning_rate": 4.881192885089125e-06, + "loss": 0.6896, + "step": 1691 + }, + { + "epoch": 0.6246615801132168, + "grad_norm": 0.904160737991333, + "learning_rate": 4.88104468535769e-06, + "loss": 0.7349, + "step": 1692 + }, + { + "epoch": 0.6250307654442531, + "grad_norm": 0.8994008898735046, + "learning_rate": 4.880896395504464e-06, + "loss": 0.7331, + "step": 1693 + }, + { + "epoch": 0.6253999507752892, + "grad_norm": 0.8491187691688538, + "learning_rate": 4.8807480155350605e-06, + "loss": 0.7185, + "step": 1694 + }, + { + "epoch": 0.6257691361063253, + "grad_norm": 0.8684033155441284, + "learning_rate": 4.880599545455097e-06, + "loss": 0.7089, + "step": 1695 + }, + { + "epoch": 0.6261383214373616, + "grad_norm": 0.9083407521247864, + "learning_rate": 4.880450985270191e-06, + "loss": 0.7018, + "step": 1696 + }, + { + "epoch": 0.6265075067683977, + "grad_norm": 0.9744650721549988, + "learning_rate": 4.880302334985967e-06, + "loss": 0.7175, + "step": 1697 + }, + { + "epoch": 0.6268766920994339, + "grad_norm": 0.9108169078826904, + "learning_rate": 4.880153594608051e-06, + "loss": 0.7127, + "step": 1698 + }, + { + "epoch": 0.6272458774304701, + "grad_norm": 0.8855581879615784, + "learning_rate": 4.880004764142073e-06, + "loss": 0.7121, + "step": 1699 + }, + { + "epoch": 0.6276150627615062, + "grad_norm": 0.8942852020263672, + "learning_rate": 4.879855843593665e-06, + "loss": 0.6918, + "step": 1700 + }, + { + "epoch": 0.6279842480925425, + "grad_norm": 0.9338670969009399, + "learning_rate": 4.879706832968465e-06, + "loss": 0.7297, + "step": 1701 + }, + { + "epoch": 0.6283534334235786, + "grad_norm": 0.8706702589988708, + "learning_rate": 4.879557732272112e-06, + "loss": 0.6909, + "step": 1702 + }, + { + "epoch": 0.6287226187546148, + "grad_norm": 0.8873736262321472, + "learning_rate": 4.87940854151025e-06, + "loss": 0.7814, + "step": 1703 + }, + { + "epoch": 0.629091804085651, + "grad_norm": 0.8286136984825134, + "learning_rate": 4.879259260688526e-06, + "loss": 0.6852, + "step": 1704 + }, + { + "epoch": 0.6294609894166872, + "grad_norm": 0.8823638558387756, + "learning_rate": 4.879109889812589e-06, + "loss": 0.7283, + "step": 1705 + }, + { + "epoch": 0.6298301747477234, + "grad_norm": 0.8702864646911621, + "learning_rate": 4.878960428888094e-06, + "loss": 0.7124, + "step": 1706 + }, + { + "epoch": 0.6301993600787595, + "grad_norm": 0.877116858959198, + "learning_rate": 4.878810877920698e-06, + "loss": 0.734, + "step": 1707 + }, + { + "epoch": 0.6305685454097957, + "grad_norm": 0.8908865451812744, + "learning_rate": 4.878661236916061e-06, + "loss": 0.7197, + "step": 1708 + }, + { + "epoch": 0.6309377307408319, + "grad_norm": 0.8861148357391357, + "learning_rate": 4.878511505879846e-06, + "loss": 0.6981, + "step": 1709 + }, + { + "epoch": 0.6313069160718681, + "grad_norm": 0.8963793516159058, + "learning_rate": 4.8783616848177215e-06, + "loss": 0.7136, + "step": 1710 + }, + { + "epoch": 0.6316761014029043, + "grad_norm": 0.9019988775253296, + "learning_rate": 4.878211773735359e-06, + "loss": 0.7063, + "step": 1711 + }, + { + "epoch": 0.6320452867339404, + "grad_norm": 0.9044898748397827, + "learning_rate": 4.8780617726384305e-06, + "loss": 0.7491, + "step": 1712 + }, + { + "epoch": 0.6324144720649766, + "grad_norm": 0.8985450863838196, + "learning_rate": 4.877911681532614e-06, + "loss": 0.6868, + "step": 1713 + }, + { + "epoch": 0.6327836573960128, + "grad_norm": 0.931446373462677, + "learning_rate": 4.877761500423591e-06, + "loss": 0.7363, + "step": 1714 + }, + { + "epoch": 0.633152842727049, + "grad_norm": 0.916556715965271, + "learning_rate": 4.877611229317047e-06, + "loss": 0.773, + "step": 1715 + }, + { + "epoch": 0.6335220280580851, + "grad_norm": 0.8990119695663452, + "learning_rate": 4.877460868218667e-06, + "loss": 0.6959, + "step": 1716 + }, + { + "epoch": 0.6338912133891214, + "grad_norm": 0.8884509205818176, + "learning_rate": 4.877310417134144e-06, + "loss": 0.7187, + "step": 1717 + }, + { + "epoch": 0.6342603987201575, + "grad_norm": 0.8993596434593201, + "learning_rate": 4.8771598760691715e-06, + "loss": 0.7331, + "step": 1718 + }, + { + "epoch": 0.6346295840511937, + "grad_norm": 0.8999956846237183, + "learning_rate": 4.877009245029448e-06, + "loss": 0.7349, + "step": 1719 + }, + { + "epoch": 0.6349987693822299, + "grad_norm": 0.8785694241523743, + "learning_rate": 4.876858524020675e-06, + "loss": 0.689, + "step": 1720 + }, + { + "epoch": 0.635367954713266, + "grad_norm": 0.8470606803894043, + "learning_rate": 4.876707713048558e-06, + "loss": 0.676, + "step": 1721 + }, + { + "epoch": 0.6357371400443023, + "grad_norm": 0.8418689966201782, + "learning_rate": 4.876556812118802e-06, + "loss": 0.709, + "step": 1722 + }, + { + "epoch": 0.6361063253753384, + "grad_norm": 0.8840335011482239, + "learning_rate": 4.876405821237122e-06, + "loss": 0.7133, + "step": 1723 + }, + { + "epoch": 0.6364755107063746, + "grad_norm": 0.8696883916854858, + "learning_rate": 4.876254740409232e-06, + "loss": 0.6845, + "step": 1724 + }, + { + "epoch": 0.6368446960374108, + "grad_norm": 0.8984381556510925, + "learning_rate": 4.876103569640849e-06, + "loss": 0.7111, + "step": 1725 + }, + { + "epoch": 0.6372138813684469, + "grad_norm": 0.8952850699424744, + "learning_rate": 4.875952308937697e-06, + "loss": 0.7383, + "step": 1726 + }, + { + "epoch": 0.6375830666994832, + "grad_norm": 0.8560416102409363, + "learning_rate": 4.875800958305499e-06, + "loss": 0.6988, + "step": 1727 + }, + { + "epoch": 0.6379522520305193, + "grad_norm": 0.8600884079933167, + "learning_rate": 4.875649517749985e-06, + "loss": 0.6895, + "step": 1728 + }, + { + "epoch": 0.6383214373615556, + "grad_norm": 0.9282815456390381, + "learning_rate": 4.875497987276886e-06, + "loss": 0.7253, + "step": 1729 + }, + { + "epoch": 0.6386906226925917, + "grad_norm": 0.8954489827156067, + "learning_rate": 4.875346366891939e-06, + "loss": 0.7638, + "step": 1730 + }, + { + "epoch": 0.6390598080236278, + "grad_norm": 0.8884443640708923, + "learning_rate": 4.875194656600881e-06, + "loss": 0.7284, + "step": 1731 + }, + { + "epoch": 0.6394289933546641, + "grad_norm": 0.8890431523323059, + "learning_rate": 4.875042856409454e-06, + "loss": 0.7393, + "step": 1732 + }, + { + "epoch": 0.6397981786857002, + "grad_norm": 0.8974068760871887, + "learning_rate": 4.874890966323406e-06, + "loss": 0.7329, + "step": 1733 + }, + { + "epoch": 0.6401673640167364, + "grad_norm": 0.8659381866455078, + "learning_rate": 4.874738986348484e-06, + "loss": 0.7035, + "step": 1734 + }, + { + "epoch": 0.6405365493477726, + "grad_norm": 0.8336740732192993, + "learning_rate": 4.87458691649044e-06, + "loss": 0.6696, + "step": 1735 + }, + { + "epoch": 0.6409057346788087, + "grad_norm": 0.8795167207717896, + "learning_rate": 4.874434756755032e-06, + "loss": 0.7289, + "step": 1736 + }, + { + "epoch": 0.641274920009845, + "grad_norm": 0.8558307886123657, + "learning_rate": 4.874282507148017e-06, + "loss": 0.7214, + "step": 1737 + }, + { + "epoch": 0.6416441053408811, + "grad_norm": 0.8652957677841187, + "learning_rate": 4.8741301676751584e-06, + "loss": 0.7629, + "step": 1738 + }, + { + "epoch": 0.6420132906719173, + "grad_norm": 0.8892576098442078, + "learning_rate": 4.873977738342222e-06, + "loss": 0.7168, + "step": 1739 + }, + { + "epoch": 0.6423824760029535, + "grad_norm": 0.8710838556289673, + "learning_rate": 4.873825219154978e-06, + "loss": 0.6837, + "step": 1740 + }, + { + "epoch": 0.6427516613339896, + "grad_norm": 0.8877007365226746, + "learning_rate": 4.873672610119199e-06, + "loss": 0.6765, + "step": 1741 + }, + { + "epoch": 0.6431208466650259, + "grad_norm": 0.8830922842025757, + "learning_rate": 4.87351991124066e-06, + "loss": 0.6797, + "step": 1742 + }, + { + "epoch": 0.643490031996062, + "grad_norm": 0.8394154906272888, + "learning_rate": 4.873367122525142e-06, + "loss": 0.666, + "step": 1743 + }, + { + "epoch": 0.6438592173270982, + "grad_norm": 0.9082286953926086, + "learning_rate": 4.873214243978427e-06, + "loss": 0.6855, + "step": 1744 + }, + { + "epoch": 0.6442284026581344, + "grad_norm": 0.869299054145813, + "learning_rate": 4.873061275606302e-06, + "loss": 0.7026, + "step": 1745 + }, + { + "epoch": 0.6445975879891706, + "grad_norm": 0.8861920833587646, + "learning_rate": 4.872908217414557e-06, + "loss": 0.7525, + "step": 1746 + }, + { + "epoch": 0.6449667733202068, + "grad_norm": 0.9306628704071045, + "learning_rate": 4.8727550694089845e-06, + "loss": 0.7314, + "step": 1747 + }, + { + "epoch": 0.6453359586512429, + "grad_norm": 0.8352028131484985, + "learning_rate": 4.872601831595381e-06, + "loss": 0.6876, + "step": 1748 + }, + { + "epoch": 0.6457051439822791, + "grad_norm": 0.91121506690979, + "learning_rate": 4.872448503979548e-06, + "loss": 0.6885, + "step": 1749 + }, + { + "epoch": 0.6460743293133153, + "grad_norm": 0.9285972714424133, + "learning_rate": 4.872295086567288e-06, + "loss": 0.7105, + "step": 1750 + }, + { + "epoch": 0.6464435146443515, + "grad_norm": 0.914553701877594, + "learning_rate": 4.872141579364407e-06, + "loss": 0.7032, + "step": 1751 + }, + { + "epoch": 0.6468126999753876, + "grad_norm": 0.8394815325737, + "learning_rate": 4.871987982376716e-06, + "loss": 0.6784, + "step": 1752 + }, + { + "epoch": 0.6471818853064238, + "grad_norm": 0.8877943754196167, + "learning_rate": 4.871834295610028e-06, + "loss": 0.7183, + "step": 1753 + }, + { + "epoch": 0.64755107063746, + "grad_norm": 0.8248438239097595, + "learning_rate": 4.871680519070162e-06, + "loss": 0.6573, + "step": 1754 + }, + { + "epoch": 0.6479202559684962, + "grad_norm": 0.88385409116745, + "learning_rate": 4.871526652762936e-06, + "loss": 0.7206, + "step": 1755 + }, + { + "epoch": 0.6482894412995324, + "grad_norm": 0.8734132647514343, + "learning_rate": 4.8713726966941745e-06, + "loss": 0.7712, + "step": 1756 + }, + { + "epoch": 0.6486586266305685, + "grad_norm": 0.8956114649772644, + "learning_rate": 4.871218650869704e-06, + "loss": 0.7221, + "step": 1757 + }, + { + "epoch": 0.6490278119616048, + "grad_norm": 0.9125884771347046, + "learning_rate": 4.871064515295357e-06, + "loss": 0.7374, + "step": 1758 + }, + { + "epoch": 0.6493969972926409, + "grad_norm": 0.8915518522262573, + "learning_rate": 4.870910289976967e-06, + "loss": 0.7126, + "step": 1759 + }, + { + "epoch": 0.6497661826236771, + "grad_norm": 0.9576418995857239, + "learning_rate": 4.870755974920369e-06, + "loss": 0.7538, + "step": 1760 + }, + { + "epoch": 0.6501353679547133, + "grad_norm": 0.8745632171630859, + "learning_rate": 4.870601570131407e-06, + "loss": 0.6837, + "step": 1761 + }, + { + "epoch": 0.6505045532857494, + "grad_norm": 0.9216246604919434, + "learning_rate": 4.870447075615923e-06, + "loss": 0.7076, + "step": 1762 + }, + { + "epoch": 0.6508737386167857, + "grad_norm": 0.8735246658325195, + "learning_rate": 4.870292491379765e-06, + "loss": 0.6905, + "step": 1763 + }, + { + "epoch": 0.6512429239478218, + "grad_norm": 0.8984786868095398, + "learning_rate": 4.870137817428786e-06, + "loss": 0.6877, + "step": 1764 + }, + { + "epoch": 0.6516121092788579, + "grad_norm": 0.8973082304000854, + "learning_rate": 4.869983053768838e-06, + "loss": 0.715, + "step": 1765 + }, + { + "epoch": 0.6519812946098942, + "grad_norm": 0.8570342063903809, + "learning_rate": 4.869828200405778e-06, + "loss": 0.6751, + "step": 1766 + }, + { + "epoch": 0.6523504799409303, + "grad_norm": 0.8807101845741272, + "learning_rate": 4.86967325734547e-06, + "loss": 0.7177, + "step": 1767 + }, + { + "epoch": 0.6527196652719666, + "grad_norm": 0.8817850947380066, + "learning_rate": 4.869518224593777e-06, + "loss": 0.7227, + "step": 1768 + }, + { + "epoch": 0.6530888506030027, + "grad_norm": 0.9274417161941528, + "learning_rate": 4.869363102156566e-06, + "loss": 0.7644, + "step": 1769 + }, + { + "epoch": 0.6534580359340388, + "grad_norm": 0.8698463439941406, + "learning_rate": 4.86920789003971e-06, + "loss": 0.69, + "step": 1770 + }, + { + "epoch": 0.6538272212650751, + "grad_norm": 0.9024192690849304, + "learning_rate": 4.869052588249083e-06, + "loss": 0.6817, + "step": 1771 + }, + { + "epoch": 0.6541964065961112, + "grad_norm": 0.8906611800193787, + "learning_rate": 4.868897196790563e-06, + "loss": 0.7094, + "step": 1772 + }, + { + "epoch": 0.6545655919271475, + "grad_norm": 0.9051023125648499, + "learning_rate": 4.868741715670032e-06, + "loss": 0.7047, + "step": 1773 + }, + { + "epoch": 0.6549347772581836, + "grad_norm": 0.8683571219444275, + "learning_rate": 4.868586144893375e-06, + "loss": 0.6969, + "step": 1774 + }, + { + "epoch": 0.6553039625892197, + "grad_norm": 0.8832207322120667, + "learning_rate": 4.8684304844664796e-06, + "loss": 0.7062, + "step": 1775 + }, + { + "epoch": 0.655673147920256, + "grad_norm": 0.8671314716339111, + "learning_rate": 4.868274734395238e-06, + "loss": 0.7214, + "step": 1776 + }, + { + "epoch": 0.6560423332512921, + "grad_norm": 0.9176437258720398, + "learning_rate": 4.8681188946855454e-06, + "loss": 0.7633, + "step": 1777 + }, + { + "epoch": 0.6564115185823284, + "grad_norm": 0.9051275849342346, + "learning_rate": 4.867962965343299e-06, + "loss": 0.7361, + "step": 1778 + }, + { + "epoch": 0.6567807039133645, + "grad_norm": 0.8660034537315369, + "learning_rate": 4.867806946374403e-06, + "loss": 0.6905, + "step": 1779 + }, + { + "epoch": 0.6571498892444007, + "grad_norm": 0.8938033580780029, + "learning_rate": 4.86765083778476e-06, + "loss": 0.7329, + "step": 1780 + }, + { + "epoch": 0.6575190745754369, + "grad_norm": 0.8933357000350952, + "learning_rate": 4.867494639580281e-06, + "loss": 0.7153, + "step": 1781 + }, + { + "epoch": 0.657888259906473, + "grad_norm": 0.8628551363945007, + "learning_rate": 4.867338351766877e-06, + "loss": 0.7198, + "step": 1782 + }, + { + "epoch": 0.6582574452375092, + "grad_norm": 0.8424699306488037, + "learning_rate": 4.867181974350463e-06, + "loss": 0.6751, + "step": 1783 + }, + { + "epoch": 0.6586266305685454, + "grad_norm": 0.8781000375747681, + "learning_rate": 4.867025507336959e-06, + "loss": 0.74, + "step": 1784 + }, + { + "epoch": 0.6589958158995816, + "grad_norm": 0.8817055821418762, + "learning_rate": 4.866868950732286e-06, + "loss": 0.6806, + "step": 1785 + }, + { + "epoch": 0.6593650012306178, + "grad_norm": 0.89399254322052, + "learning_rate": 4.8667123045423705e-06, + "loss": 0.7123, + "step": 1786 + }, + { + "epoch": 0.659734186561654, + "grad_norm": 0.8942681550979614, + "learning_rate": 4.866555568773141e-06, + "loss": 0.7426, + "step": 1787 + }, + { + "epoch": 0.6601033718926901, + "grad_norm": 0.8757315278053284, + "learning_rate": 4.866398743430531e-06, + "loss": 0.7476, + "step": 1788 + }, + { + "epoch": 0.6604725572237263, + "grad_norm": 0.8564402461051941, + "learning_rate": 4.866241828520475e-06, + "loss": 0.6926, + "step": 1789 + }, + { + "epoch": 0.6608417425547625, + "grad_norm": 0.8761123418807983, + "learning_rate": 4.866084824048913e-06, + "loss": 0.7061, + "step": 1790 + }, + { + "epoch": 0.6612109278857987, + "grad_norm": 0.8826016783714294, + "learning_rate": 4.8659277300217856e-06, + "loss": 0.6939, + "step": 1791 + }, + { + "epoch": 0.6615801132168349, + "grad_norm": 0.8721649646759033, + "learning_rate": 4.865770546445041e-06, + "loss": 0.727, + "step": 1792 + }, + { + "epoch": 0.661949298547871, + "grad_norm": 0.91274493932724, + "learning_rate": 4.865613273324629e-06, + "loss": 0.7393, + "step": 1793 + }, + { + "epoch": 0.6623184838789072, + "grad_norm": 0.8830939531326294, + "learning_rate": 4.8654559106665e-06, + "loss": 0.687, + "step": 1794 + }, + { + "epoch": 0.6626876692099434, + "grad_norm": 0.8717511892318726, + "learning_rate": 4.865298458476612e-06, + "loss": 0.6921, + "step": 1795 + }, + { + "epoch": 0.6630568545409796, + "grad_norm": 0.8554193377494812, + "learning_rate": 4.865140916760923e-06, + "loss": 0.6921, + "step": 1796 + }, + { + "epoch": 0.6634260398720158, + "grad_norm": 0.8773930668830872, + "learning_rate": 4.864983285525397e-06, + "loss": 0.7317, + "step": 1797 + }, + { + "epoch": 0.6637952252030519, + "grad_norm": 0.8963366150856018, + "learning_rate": 4.864825564776e-06, + "loss": 0.7034, + "step": 1798 + }, + { + "epoch": 0.6641644105340881, + "grad_norm": 0.8847402334213257, + "learning_rate": 4.864667754518702e-06, + "loss": 0.7242, + "step": 1799 + }, + { + "epoch": 0.6645335958651243, + "grad_norm": 0.8819407224655151, + "learning_rate": 4.864509854759476e-06, + "loss": 0.7329, + "step": 1800 + }, + { + "epoch": 0.6649027811961604, + "grad_norm": 0.8914307951927185, + "learning_rate": 4.864351865504298e-06, + "loss": 0.7128, + "step": 1801 + }, + { + "epoch": 0.6652719665271967, + "grad_norm": 0.8843200206756592, + "learning_rate": 4.864193786759148e-06, + "loss": 0.7198, + "step": 1802 + }, + { + "epoch": 0.6656411518582328, + "grad_norm": 0.8661046028137207, + "learning_rate": 4.8640356185300094e-06, + "loss": 0.7281, + "step": 1803 + }, + { + "epoch": 0.666010337189269, + "grad_norm": 0.8948192596435547, + "learning_rate": 4.863877360822869e-06, + "loss": 0.7508, + "step": 1804 + }, + { + "epoch": 0.6663795225203052, + "grad_norm": 0.8964792490005493, + "learning_rate": 4.863719013643716e-06, + "loss": 0.7193, + "step": 1805 + }, + { + "epoch": 0.6667487078513413, + "grad_norm": 0.8771764039993286, + "learning_rate": 4.863560576998545e-06, + "loss": 0.7127, + "step": 1806 + }, + { + "epoch": 0.6671178931823776, + "grad_norm": 0.8624410033226013, + "learning_rate": 4.8634020508933524e-06, + "loss": 0.6808, + "step": 1807 + }, + { + "epoch": 0.6674870785134137, + "grad_norm": 0.901315450668335, + "learning_rate": 4.863243435334137e-06, + "loss": 0.6907, + "step": 1808 + }, + { + "epoch": 0.66785626384445, + "grad_norm": 0.8859581351280212, + "learning_rate": 4.8630847303269034e-06, + "loss": 0.7351, + "step": 1809 + }, + { + "epoch": 0.6682254491754861, + "grad_norm": 0.9065508842468262, + "learning_rate": 4.862925935877659e-06, + "loss": 0.7049, + "step": 1810 + }, + { + "epoch": 0.6685946345065222, + "grad_norm": 0.8637186288833618, + "learning_rate": 4.8627670519924146e-06, + "loss": 0.702, + "step": 1811 + }, + { + "epoch": 0.6689638198375585, + "grad_norm": 0.8939265012741089, + "learning_rate": 4.862608078677181e-06, + "loss": 0.7209, + "step": 1812 + }, + { + "epoch": 0.6693330051685946, + "grad_norm": 0.9074069857597351, + "learning_rate": 4.862449015937979e-06, + "loss": 0.7215, + "step": 1813 + }, + { + "epoch": 0.6697021904996309, + "grad_norm": 0.898177444934845, + "learning_rate": 4.8622898637808265e-06, + "loss": 0.7341, + "step": 1814 + }, + { + "epoch": 0.670071375830667, + "grad_norm": 0.8927603960037231, + "learning_rate": 4.862130622211749e-06, + "loss": 0.7372, + "step": 1815 + }, + { + "epoch": 0.6704405611617031, + "grad_norm": 0.9042506814002991, + "learning_rate": 4.861971291236772e-06, + "loss": 0.6699, + "step": 1816 + }, + { + "epoch": 0.6708097464927394, + "grad_norm": 0.867923378944397, + "learning_rate": 4.861811870861928e-06, + "loss": 0.7065, + "step": 1817 + }, + { + "epoch": 0.6711789318237755, + "grad_norm": 0.8792146444320679, + "learning_rate": 4.861652361093249e-06, + "loss": 0.7359, + "step": 1818 + }, + { + "epoch": 0.6715481171548117, + "grad_norm": 0.8699401021003723, + "learning_rate": 4.861492761936774e-06, + "loss": 0.73, + "step": 1819 + }, + { + "epoch": 0.6719173024858479, + "grad_norm": 0.9019505381584167, + "learning_rate": 4.861333073398543e-06, + "loss": 0.7142, + "step": 1820 + }, + { + "epoch": 0.672286487816884, + "grad_norm": 0.8983092904090881, + "learning_rate": 4.8611732954846015e-06, + "loss": 0.7232, + "step": 1821 + }, + { + "epoch": 0.6726556731479203, + "grad_norm": 0.8775074481964111, + "learning_rate": 4.861013428200995e-06, + "loss": 0.7051, + "step": 1822 + }, + { + "epoch": 0.6730248584789564, + "grad_norm": 0.8952973484992981, + "learning_rate": 4.8608534715537755e-06, + "loss": 0.7544, + "step": 1823 + }, + { + "epoch": 0.6733940438099926, + "grad_norm": 0.857286274433136, + "learning_rate": 4.860693425548997e-06, + "loss": 0.6949, + "step": 1824 + }, + { + "epoch": 0.6737632291410288, + "grad_norm": 0.850751519203186, + "learning_rate": 4.8605332901927175e-06, + "loss": 0.7082, + "step": 1825 + }, + { + "epoch": 0.674132414472065, + "grad_norm": 0.872599720954895, + "learning_rate": 4.860373065490998e-06, + "loss": 0.7073, + "step": 1826 + }, + { + "epoch": 0.6745015998031012, + "grad_norm": 0.9076393842697144, + "learning_rate": 4.860212751449903e-06, + "loss": 0.6977, + "step": 1827 + }, + { + "epoch": 0.6748707851341373, + "grad_norm": 0.8853808045387268, + "learning_rate": 4.8600523480755e-06, + "loss": 0.6811, + "step": 1828 + }, + { + "epoch": 0.6752399704651735, + "grad_norm": 0.8611722588539124, + "learning_rate": 4.859891855373861e-06, + "loss": 0.7209, + "step": 1829 + }, + { + "epoch": 0.6756091557962097, + "grad_norm": 0.8485976457595825, + "learning_rate": 4.85973127335106e-06, + "loss": 0.6629, + "step": 1830 + }, + { + "epoch": 0.6759783411272459, + "grad_norm": 0.875453770160675, + "learning_rate": 4.859570602013175e-06, + "loss": 0.7349, + "step": 1831 + }, + { + "epoch": 0.6763475264582821, + "grad_norm": 0.8506700992584229, + "learning_rate": 4.859409841366287e-06, + "loss": 0.6818, + "step": 1832 + }, + { + "epoch": 0.6767167117893182, + "grad_norm": 0.8715220093727112, + "learning_rate": 4.859248991416481e-06, + "loss": 0.7189, + "step": 1833 + }, + { + "epoch": 0.6770858971203544, + "grad_norm": 0.9876884818077087, + "learning_rate": 4.859088052169845e-06, + "loss": 0.7445, + "step": 1834 + }, + { + "epoch": 0.6774550824513906, + "grad_norm": 0.8883238434791565, + "learning_rate": 4.858927023632472e-06, + "loss": 0.7338, + "step": 1835 + }, + { + "epoch": 0.6778242677824268, + "grad_norm": 0.8532505035400391, + "learning_rate": 4.858765905810455e-06, + "loss": 0.6844, + "step": 1836 + }, + { + "epoch": 0.6781934531134629, + "grad_norm": 0.8952226042747498, + "learning_rate": 4.8586046987098935e-06, + "loss": 0.6937, + "step": 1837 + }, + { + "epoch": 0.6785626384444992, + "grad_norm": 0.8690524101257324, + "learning_rate": 4.858443402336888e-06, + "loss": 0.7029, + "step": 1838 + }, + { + "epoch": 0.6789318237755353, + "grad_norm": 0.8886831998825073, + "learning_rate": 4.858282016697544e-06, + "loss": 0.728, + "step": 1839 + }, + { + "epoch": 0.6793010091065715, + "grad_norm": 0.9667893052101135, + "learning_rate": 4.85812054179797e-06, + "loss": 0.7331, + "step": 1840 + }, + { + "epoch": 0.6796701944376077, + "grad_norm": 0.8873503804206848, + "learning_rate": 4.857958977644278e-06, + "loss": 0.7073, + "step": 1841 + }, + { + "epoch": 0.6800393797686438, + "grad_norm": 0.8646997213363647, + "learning_rate": 4.857797324242582e-06, + "loss": 0.6857, + "step": 1842 + }, + { + "epoch": 0.6804085650996801, + "grad_norm": 0.8797730207443237, + "learning_rate": 4.857635581599003e-06, + "loss": 0.7488, + "step": 1843 + }, + { + "epoch": 0.6807777504307162, + "grad_norm": 0.8577935695648193, + "learning_rate": 4.85747374971966e-06, + "loss": 0.7206, + "step": 1844 + }, + { + "epoch": 0.6811469357617524, + "grad_norm": 0.871494472026825, + "learning_rate": 4.8573118286106805e-06, + "loss": 0.707, + "step": 1845 + }, + { + "epoch": 0.6815161210927886, + "grad_norm": 0.8755682110786438, + "learning_rate": 4.857149818278192e-06, + "loss": 0.7693, + "step": 1846 + }, + { + "epoch": 0.6818853064238247, + "grad_norm": 0.8881521821022034, + "learning_rate": 4.8569877187283255e-06, + "loss": 0.7216, + "step": 1847 + }, + { + "epoch": 0.682254491754861, + "grad_norm": 0.8813753724098206, + "learning_rate": 4.856825529967219e-06, + "loss": 0.6836, + "step": 1848 + }, + { + "epoch": 0.6826236770858971, + "grad_norm": 0.8883729577064514, + "learning_rate": 4.85666325200101e-06, + "loss": 0.6993, + "step": 1849 + }, + { + "epoch": 0.6829928624169334, + "grad_norm": 0.9154224395751953, + "learning_rate": 4.8565008848358405e-06, + "loss": 0.7361, + "step": 1850 + }, + { + "epoch": 0.6833620477479695, + "grad_norm": 0.8938775062561035, + "learning_rate": 4.856338428477856e-06, + "loss": 0.7191, + "step": 1851 + }, + { + "epoch": 0.6837312330790056, + "grad_norm": 0.8818868398666382, + "learning_rate": 4.8561758829332064e-06, + "loss": 0.687, + "step": 1852 + }, + { + "epoch": 0.6841004184100419, + "grad_norm": 0.9043888449668884, + "learning_rate": 4.856013248208043e-06, + "loss": 0.7375, + "step": 1853 + }, + { + "epoch": 0.684469603741078, + "grad_norm": 0.8634012341499329, + "learning_rate": 4.855850524308521e-06, + "loss": 0.7032, + "step": 1854 + }, + { + "epoch": 0.6848387890721142, + "grad_norm": 0.8524016737937927, + "learning_rate": 4.8556877112408e-06, + "loss": 0.6875, + "step": 1855 + }, + { + "epoch": 0.6852079744031504, + "grad_norm": 0.8432945013046265, + "learning_rate": 4.855524809011043e-06, + "loss": 0.6705, + "step": 1856 + }, + { + "epoch": 0.6855771597341865, + "grad_norm": 0.8706541657447815, + "learning_rate": 4.855361817625416e-06, + "loss": 0.7322, + "step": 1857 + }, + { + "epoch": 0.6859463450652228, + "grad_norm": 0.8889358043670654, + "learning_rate": 4.8551987370900875e-06, + "loss": 0.7295, + "step": 1858 + }, + { + "epoch": 0.6863155303962589, + "grad_norm": 0.8888987302780151, + "learning_rate": 4.8550355674112295e-06, + "loss": 0.7163, + "step": 1859 + }, + { + "epoch": 0.6866847157272951, + "grad_norm": 0.8585571646690369, + "learning_rate": 4.854872308595019e-06, + "loss": 0.7041, + "step": 1860 + }, + { + "epoch": 0.6870539010583313, + "grad_norm": 0.872947096824646, + "learning_rate": 4.8547089606476335e-06, + "loss": 0.7142, + "step": 1861 + }, + { + "epoch": 0.6874230863893674, + "grad_norm": 0.9010311961174011, + "learning_rate": 4.854545523575259e-06, + "loss": 0.729, + "step": 1862 + }, + { + "epoch": 0.6877922717204037, + "grad_norm": 0.8814377784729004, + "learning_rate": 4.854381997384079e-06, + "loss": 0.6988, + "step": 1863 + }, + { + "epoch": 0.6881614570514398, + "grad_norm": 0.8774176239967346, + "learning_rate": 4.854218382080283e-06, + "loss": 0.7103, + "step": 1864 + }, + { + "epoch": 0.688530642382476, + "grad_norm": 0.8719117641448975, + "learning_rate": 4.854054677670064e-06, + "loss": 0.7084, + "step": 1865 + }, + { + "epoch": 0.6888998277135122, + "grad_norm": 0.874610424041748, + "learning_rate": 4.853890884159619e-06, + "loss": 0.6931, + "step": 1866 + }, + { + "epoch": 0.6892690130445484, + "grad_norm": 0.9094902276992798, + "learning_rate": 4.853727001555146e-06, + "loss": 0.7156, + "step": 1867 + }, + { + "epoch": 0.6896381983755845, + "grad_norm": 0.8830024003982544, + "learning_rate": 4.85356302986285e-06, + "loss": 0.7119, + "step": 1868 + }, + { + "epoch": 0.6900073837066207, + "grad_norm": 0.9095394611358643, + "learning_rate": 4.853398969088936e-06, + "loss": 0.7219, + "step": 1869 + }, + { + "epoch": 0.6903765690376569, + "grad_norm": 0.8986232280731201, + "learning_rate": 4.853234819239613e-06, + "loss": 0.71, + "step": 1870 + }, + { + "epoch": 0.6907457543686931, + "grad_norm": 0.8806168437004089, + "learning_rate": 4.8530705803210955e-06, + "loss": 0.6731, + "step": 1871 + }, + { + "epoch": 0.6911149396997293, + "grad_norm": 0.9023184180259705, + "learning_rate": 4.852906252339598e-06, + "loss": 0.7086, + "step": 1872 + }, + { + "epoch": 0.6914841250307654, + "grad_norm": 0.8981339931488037, + "learning_rate": 4.852741835301343e-06, + "loss": 0.7037, + "step": 1873 + }, + { + "epoch": 0.6918533103618016, + "grad_norm": 0.9292305111885071, + "learning_rate": 4.852577329212551e-06, + "loss": 0.7096, + "step": 1874 + }, + { + "epoch": 0.6922224956928378, + "grad_norm": 0.903359591960907, + "learning_rate": 4.85241273407945e-06, + "loss": 0.6792, + "step": 1875 + }, + { + "epoch": 0.692591681023874, + "grad_norm": 0.893671452999115, + "learning_rate": 4.85224804990827e-06, + "loss": 0.7367, + "step": 1876 + }, + { + "epoch": 0.6929608663549102, + "grad_norm": 0.9473923444747925, + "learning_rate": 4.852083276705243e-06, + "loss": 0.6995, + "step": 1877 + }, + { + "epoch": 0.6933300516859463, + "grad_norm": 0.9057194590568542, + "learning_rate": 4.851918414476606e-06, + "loss": 0.7498, + "step": 1878 + }, + { + "epoch": 0.6936992370169826, + "grad_norm": 0.8625937700271606, + "learning_rate": 4.8517534632286e-06, + "loss": 0.6826, + "step": 1879 + }, + { + "epoch": 0.6940684223480187, + "grad_norm": 0.8820124864578247, + "learning_rate": 4.851588422967467e-06, + "loss": 0.7204, + "step": 1880 + }, + { + "epoch": 0.6944376076790549, + "grad_norm": 0.9044023752212524, + "learning_rate": 4.851423293699455e-06, + "loss": 0.7131, + "step": 1881 + }, + { + "epoch": 0.6948067930100911, + "grad_norm": 0.8808709979057312, + "learning_rate": 4.851258075430813e-06, + "loss": 0.68, + "step": 1882 + }, + { + "epoch": 0.6951759783411272, + "grad_norm": 0.875217854976654, + "learning_rate": 4.851092768167795e-06, + "loss": 0.7038, + "step": 1883 + }, + { + "epoch": 0.6955451636721635, + "grad_norm": 0.8607890605926514, + "learning_rate": 4.850927371916658e-06, + "loss": 0.6845, + "step": 1884 + }, + { + "epoch": 0.6959143490031996, + "grad_norm": 0.9063311815261841, + "learning_rate": 4.850761886683662e-06, + "loss": 0.7337, + "step": 1885 + }, + { + "epoch": 0.6962835343342357, + "grad_norm": 0.9188293814659119, + "learning_rate": 4.85059631247507e-06, + "loss": 0.6916, + "step": 1886 + }, + { + "epoch": 0.696652719665272, + "grad_norm": 0.913374125957489, + "learning_rate": 4.85043064929715e-06, + "loss": 0.759, + "step": 1887 + }, + { + "epoch": 0.6970219049963081, + "grad_norm": 0.8925139904022217, + "learning_rate": 4.850264897156171e-06, + "loss": 0.7173, + "step": 1888 + }, + { + "epoch": 0.6973910903273444, + "grad_norm": 0.9073317050933838, + "learning_rate": 4.8500990560584075e-06, + "loss": 0.7424, + "step": 1889 + }, + { + "epoch": 0.6977602756583805, + "grad_norm": 0.8561435341835022, + "learning_rate": 4.8499331260101365e-06, + "loss": 0.7152, + "step": 1890 + }, + { + "epoch": 0.6981294609894166, + "grad_norm": 0.8652263283729553, + "learning_rate": 4.8497671070176385e-06, + "loss": 0.6963, + "step": 1891 + }, + { + "epoch": 0.6984986463204529, + "grad_norm": 0.8837151527404785, + "learning_rate": 4.849600999087197e-06, + "loss": 0.7357, + "step": 1892 + }, + { + "epoch": 0.698867831651489, + "grad_norm": 0.8606382012367249, + "learning_rate": 4.8494348022251e-06, + "loss": 0.7131, + "step": 1893 + }, + { + "epoch": 0.6992370169825253, + "grad_norm": 0.8533560633659363, + "learning_rate": 4.8492685164376365e-06, + "loss": 0.695, + "step": 1894 + }, + { + "epoch": 0.6996062023135614, + "grad_norm": 0.9554173946380615, + "learning_rate": 4.849102141731101e-06, + "loss": 0.6683, + "step": 1895 + }, + { + "epoch": 0.6999753876445975, + "grad_norm": 0.9184376001358032, + "learning_rate": 4.848935678111792e-06, + "loss": 0.7477, + "step": 1896 + }, + { + "epoch": 0.7003445729756338, + "grad_norm": 0.9730942845344543, + "learning_rate": 4.848769125586007e-06, + "loss": 0.7392, + "step": 1897 + }, + { + "epoch": 0.7007137583066699, + "grad_norm": 0.8965922594070435, + "learning_rate": 4.848602484160053e-06, + "loss": 0.7275, + "step": 1898 + }, + { + "epoch": 0.7010829436377062, + "grad_norm": 0.9296282529830933, + "learning_rate": 4.848435753840236e-06, + "loss": 0.7158, + "step": 1899 + }, + { + "epoch": 0.7014521289687423, + "grad_norm": 0.9205310940742493, + "learning_rate": 4.8482689346328674e-06, + "loss": 0.6932, + "step": 1900 + }, + { + "epoch": 0.7018213142997785, + "grad_norm": 0.8746544718742371, + "learning_rate": 4.84810202654426e-06, + "loss": 0.6765, + "step": 1901 + }, + { + "epoch": 0.7021904996308147, + "grad_norm": 0.890127956867218, + "learning_rate": 4.847935029580732e-06, + "loss": 0.6847, + "step": 1902 + }, + { + "epoch": 0.7025596849618508, + "grad_norm": 0.8811405897140503, + "learning_rate": 4.847767943748605e-06, + "loss": 0.6643, + "step": 1903 + }, + { + "epoch": 0.702928870292887, + "grad_norm": 0.8962213397026062, + "learning_rate": 4.847600769054201e-06, + "loss": 0.7078, + "step": 1904 + }, + { + "epoch": 0.7032980556239232, + "grad_norm": 0.8875841498374939, + "learning_rate": 4.84743350550385e-06, + "loss": 0.707, + "step": 1905 + }, + { + "epoch": 0.7036672409549594, + "grad_norm": 0.8978013396263123, + "learning_rate": 4.8472661531038815e-06, + "loss": 0.6716, + "step": 1906 + }, + { + "epoch": 0.7040364262859956, + "grad_norm": 0.9015927910804749, + "learning_rate": 4.847098711860629e-06, + "loss": 0.6643, + "step": 1907 + }, + { + "epoch": 0.7044056116170317, + "grad_norm": 0.893324077129364, + "learning_rate": 4.846931181780431e-06, + "loss": 0.7005, + "step": 1908 + }, + { + "epoch": 0.7047747969480679, + "grad_norm": 0.9275810122489929, + "learning_rate": 4.84676356286963e-06, + "loss": 0.724, + "step": 1909 + }, + { + "epoch": 0.7051439822791041, + "grad_norm": 0.8800505995750427, + "learning_rate": 4.8465958551345675e-06, + "loss": 0.7251, + "step": 1910 + }, + { + "epoch": 0.7055131676101403, + "grad_norm": 0.8828743100166321, + "learning_rate": 4.846428058581593e-06, + "loss": 0.7277, + "step": 1911 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.9125253558158875, + "learning_rate": 4.846260173217056e-06, + "loss": 0.7619, + "step": 1912 + }, + { + "epoch": 0.7062515382722127, + "grad_norm": 0.9037159085273743, + "learning_rate": 4.846092199047314e-06, + "loss": 0.7182, + "step": 1913 + }, + { + "epoch": 0.7066207236032488, + "grad_norm": 0.8811241984367371, + "learning_rate": 4.8459241360787215e-06, + "loss": 0.749, + "step": 1914 + }, + { + "epoch": 0.706989908934285, + "grad_norm": 0.8958812952041626, + "learning_rate": 4.845755984317641e-06, + "loss": 0.7191, + "step": 1915 + }, + { + "epoch": 0.7073590942653212, + "grad_norm": 0.9097388386726379, + "learning_rate": 4.845587743770436e-06, + "loss": 0.7166, + "step": 1916 + }, + { + "epoch": 0.7077282795963574, + "grad_norm": 0.8992327451705933, + "learning_rate": 4.8454194144434765e-06, + "loss": 0.7493, + "step": 1917 + }, + { + "epoch": 0.7080974649273936, + "grad_norm": 0.9153758883476257, + "learning_rate": 4.845250996343132e-06, + "loss": 0.7036, + "step": 1918 + }, + { + "epoch": 0.7084666502584297, + "grad_norm": 0.9150279760360718, + "learning_rate": 4.845082489475777e-06, + "loss": 0.7278, + "step": 1919 + }, + { + "epoch": 0.708835835589466, + "grad_norm": 0.8316643238067627, + "learning_rate": 4.84491389384779e-06, + "loss": 0.6683, + "step": 1920 + }, + { + "epoch": 0.7092050209205021, + "grad_norm": 0.8955804705619812, + "learning_rate": 4.844745209465552e-06, + "loss": 0.7344, + "step": 1921 + }, + { + "epoch": 0.7095742062515382, + "grad_norm": 0.9217088222503662, + "learning_rate": 4.844576436335448e-06, + "loss": 0.6768, + "step": 1922 + }, + { + "epoch": 0.7099433915825745, + "grad_norm": 0.9741485714912415, + "learning_rate": 4.844407574463866e-06, + "loss": 0.713, + "step": 1923 + }, + { + "epoch": 0.7103125769136106, + "grad_norm": 0.9079376459121704, + "learning_rate": 4.844238623857197e-06, + "loss": 0.7543, + "step": 1924 + }, + { + "epoch": 0.7106817622446469, + "grad_norm": 0.8815346956253052, + "learning_rate": 4.844069584521836e-06, + "loss": 0.7318, + "step": 1925 + }, + { + "epoch": 0.711050947575683, + "grad_norm": 0.9132115244865417, + "learning_rate": 4.843900456464181e-06, + "loss": 0.7084, + "step": 1926 + }, + { + "epoch": 0.7114201329067191, + "grad_norm": 0.9009430408477783, + "learning_rate": 4.843731239690634e-06, + "loss": 0.7461, + "step": 1927 + }, + { + "epoch": 0.7117893182377554, + "grad_norm": 0.847509503364563, + "learning_rate": 4.843561934207599e-06, + "loss": 0.6802, + "step": 1928 + }, + { + "epoch": 0.7121585035687915, + "grad_norm": 0.8662732243537903, + "learning_rate": 4.843392540021485e-06, + "loss": 0.6892, + "step": 1929 + }, + { + "epoch": 0.7125276888998278, + "grad_norm": 0.9007134437561035, + "learning_rate": 4.843223057138701e-06, + "loss": 0.7029, + "step": 1930 + }, + { + "epoch": 0.7128968742308639, + "grad_norm": 0.882623553276062, + "learning_rate": 4.8430534855656655e-06, + "loss": 0.6805, + "step": 1931 + }, + { + "epoch": 0.7132660595619, + "grad_norm": 0.91568922996521, + "learning_rate": 4.842883825308794e-06, + "loss": 0.7135, + "step": 1932 + }, + { + "epoch": 0.7136352448929363, + "grad_norm": 0.9128447771072388, + "learning_rate": 4.84271407637451e-06, + "loss": 0.7521, + "step": 1933 + }, + { + "epoch": 0.7140044302239724, + "grad_norm": 0.890592098236084, + "learning_rate": 4.842544238769238e-06, + "loss": 0.7424, + "step": 1934 + }, + { + "epoch": 0.7143736155550087, + "grad_norm": 0.8989784717559814, + "learning_rate": 4.842374312499405e-06, + "loss": 0.7288, + "step": 1935 + }, + { + "epoch": 0.7147428008860448, + "grad_norm": 0.8863556981086731, + "learning_rate": 4.842204297571444e-06, + "loss": 0.7189, + "step": 1936 + }, + { + "epoch": 0.7151119862170809, + "grad_norm": 0.9078112840652466, + "learning_rate": 4.842034193991789e-06, + "loss": 0.7186, + "step": 1937 + }, + { + "epoch": 0.7154811715481172, + "grad_norm": 0.8869308829307556, + "learning_rate": 4.841864001766879e-06, + "loss": 0.7004, + "step": 1938 + }, + { + "epoch": 0.7158503568791533, + "grad_norm": 0.8935455083847046, + "learning_rate": 4.8416937209031555e-06, + "loss": 0.7231, + "step": 1939 + }, + { + "epoch": 0.7162195422101895, + "grad_norm": 0.877627968788147, + "learning_rate": 4.841523351407064e-06, + "loss": 0.6868, + "step": 1940 + }, + { + "epoch": 0.7165887275412257, + "grad_norm": 0.8890548944473267, + "learning_rate": 4.841352893285053e-06, + "loss": 0.7483, + "step": 1941 + }, + { + "epoch": 0.7169579128722618, + "grad_norm": 0.8524648547172546, + "learning_rate": 4.841182346543574e-06, + "loss": 0.7152, + "step": 1942 + }, + { + "epoch": 0.7173270982032981, + "grad_norm": 0.8882040977478027, + "learning_rate": 4.841011711189081e-06, + "loss": 0.7057, + "step": 1943 + }, + { + "epoch": 0.7176962835343342, + "grad_norm": 0.8520607352256775, + "learning_rate": 4.840840987228035e-06, + "loss": 0.6976, + "step": 1944 + }, + { + "epoch": 0.7180654688653704, + "grad_norm": 0.883282482624054, + "learning_rate": 4.840670174666896e-06, + "loss": 0.6937, + "step": 1945 + }, + { + "epoch": 0.7184346541964066, + "grad_norm": 0.8982214331626892, + "learning_rate": 4.840499273512129e-06, + "loss": 0.6427, + "step": 1946 + }, + { + "epoch": 0.7188038395274428, + "grad_norm": 0.8696303367614746, + "learning_rate": 4.840328283770203e-06, + "loss": 0.7672, + "step": 1947 + }, + { + "epoch": 0.719173024858479, + "grad_norm": 0.8526875972747803, + "learning_rate": 4.840157205447591e-06, + "loss": 0.7058, + "step": 1948 + }, + { + "epoch": 0.7195422101895151, + "grad_norm": 0.8616173267364502, + "learning_rate": 4.839986038550767e-06, + "loss": 0.6861, + "step": 1949 + }, + { + "epoch": 0.7199113955205513, + "grad_norm": 0.8847987651824951, + "learning_rate": 4.83981478308621e-06, + "loss": 0.7424, + "step": 1950 + }, + { + "epoch": 0.7202805808515875, + "grad_norm": 0.9206272959709167, + "learning_rate": 4.839643439060401e-06, + "loss": 0.712, + "step": 1951 + }, + { + "epoch": 0.7206497661826237, + "grad_norm": 0.8666979670524597, + "learning_rate": 4.8394720064798275e-06, + "loss": 0.7077, + "step": 1952 + }, + { + "epoch": 0.7210189515136599, + "grad_norm": 0.8352959752082825, + "learning_rate": 4.839300485350976e-06, + "loss": 0.6755, + "step": 1953 + }, + { + "epoch": 0.721388136844696, + "grad_norm": 0.8747813105583191, + "learning_rate": 4.83912887568034e-06, + "loss": 0.7047, + "step": 1954 + }, + { + "epoch": 0.7217573221757322, + "grad_norm": 0.9319245219230652, + "learning_rate": 4.838957177474414e-06, + "loss": 0.752, + "step": 1955 + }, + { + "epoch": 0.7221265075067684, + "grad_norm": 0.8869343996047974, + "learning_rate": 4.838785390739698e-06, + "loss": 0.7211, + "step": 1956 + }, + { + "epoch": 0.7224956928378046, + "grad_norm": 0.9012927412986755, + "learning_rate": 4.838613515482692e-06, + "loss": 0.7184, + "step": 1957 + }, + { + "epoch": 0.7228648781688407, + "grad_norm": 0.8678253293037415, + "learning_rate": 4.838441551709902e-06, + "loss": 0.7164, + "step": 1958 + }, + { + "epoch": 0.723234063499877, + "grad_norm": 0.8944018483161926, + "learning_rate": 4.838269499427838e-06, + "loss": 0.7141, + "step": 1959 + }, + { + "epoch": 0.7236032488309131, + "grad_norm": 0.868712306022644, + "learning_rate": 4.838097358643012e-06, + "loss": 0.6841, + "step": 1960 + }, + { + "epoch": 0.7239724341619493, + "grad_norm": 0.8801015615463257, + "learning_rate": 4.837925129361938e-06, + "loss": 0.6913, + "step": 1961 + }, + { + "epoch": 0.7243416194929855, + "grad_norm": 0.8590583801269531, + "learning_rate": 4.837752811591136e-06, + "loss": 0.7065, + "step": 1962 + }, + { + "epoch": 0.7247108048240216, + "grad_norm": 0.895354151725769, + "learning_rate": 4.837580405337128e-06, + "loss": 0.7574, + "step": 1963 + }, + { + "epoch": 0.7250799901550579, + "grad_norm": 0.8873820304870605, + "learning_rate": 4.83740791060644e-06, + "loss": 0.7083, + "step": 1964 + }, + { + "epoch": 0.725449175486094, + "grad_norm": 0.8843436241149902, + "learning_rate": 4.837235327405599e-06, + "loss": 0.7178, + "step": 1965 + }, + { + "epoch": 0.7258183608171302, + "grad_norm": 0.9182543158531189, + "learning_rate": 4.837062655741139e-06, + "loss": 0.7311, + "step": 1966 + }, + { + "epoch": 0.7261875461481664, + "grad_norm": 0.8999276161193848, + "learning_rate": 4.836889895619595e-06, + "loss": 0.7329, + "step": 1967 + }, + { + "epoch": 0.7265567314792025, + "grad_norm": 0.9033870697021484, + "learning_rate": 4.836717047047507e-06, + "loss": 0.7254, + "step": 1968 + }, + { + "epoch": 0.7269259168102388, + "grad_norm": 0.8504629731178284, + "learning_rate": 4.836544110031415e-06, + "loss": 0.6982, + "step": 1969 + }, + { + "epoch": 0.7272951021412749, + "grad_norm": 0.8877577185630798, + "learning_rate": 4.836371084577867e-06, + "loss": 0.7121, + "step": 1970 + }, + { + "epoch": 0.7276642874723112, + "grad_norm": 0.845675528049469, + "learning_rate": 4.8361979706934096e-06, + "loss": 0.7187, + "step": 1971 + }, + { + "epoch": 0.7280334728033473, + "grad_norm": 0.8912767767906189, + "learning_rate": 4.836024768384597e-06, + "loss": 0.7221, + "step": 1972 + }, + { + "epoch": 0.7284026581343834, + "grad_norm": 0.9066804647445679, + "learning_rate": 4.8358514776579835e-06, + "loss": 0.737, + "step": 1973 + }, + { + "epoch": 0.7287718434654197, + "grad_norm": 0.874573826789856, + "learning_rate": 4.83567809852013e-06, + "loss": 0.6928, + "step": 1974 + }, + { + "epoch": 0.7291410287964558, + "grad_norm": 0.9108884930610657, + "learning_rate": 4.835504630977597e-06, + "loss": 0.6933, + "step": 1975 + }, + { + "epoch": 0.729510214127492, + "grad_norm": 0.8197352290153503, + "learning_rate": 4.8353310750369496e-06, + "loss": 0.6412, + "step": 1976 + }, + { + "epoch": 0.7298793994585282, + "grad_norm": 0.8398105502128601, + "learning_rate": 4.83515743070476e-06, + "loss": 0.6667, + "step": 1977 + }, + { + "epoch": 0.7302485847895643, + "grad_norm": 0.8997299075126648, + "learning_rate": 4.834983697987597e-06, + "loss": 0.6776, + "step": 1978 + }, + { + "epoch": 0.7306177701206006, + "grad_norm": 0.8984546661376953, + "learning_rate": 4.834809876892039e-06, + "loss": 0.7515, + "step": 1979 + }, + { + "epoch": 0.7309869554516367, + "grad_norm": 0.8575339913368225, + "learning_rate": 4.834635967424664e-06, + "loss": 0.7179, + "step": 1980 + }, + { + "epoch": 0.7313561407826729, + "grad_norm": 0.8993721008300781, + "learning_rate": 4.8344619695920545e-06, + "loss": 0.7405, + "step": 1981 + }, + { + "epoch": 0.7317253261137091, + "grad_norm": 0.8856477737426758, + "learning_rate": 4.8342878834007955e-06, + "loss": 0.766, + "step": 1982 + }, + { + "epoch": 0.7320945114447452, + "grad_norm": 0.8776651620864868, + "learning_rate": 4.834113708857477e-06, + "loss": 0.6993, + "step": 1983 + }, + { + "epoch": 0.7324636967757815, + "grad_norm": 0.8624359965324402, + "learning_rate": 4.8339394459686925e-06, + "loss": 0.7049, + "step": 1984 + }, + { + "epoch": 0.7328328821068176, + "grad_norm": 0.9040077924728394, + "learning_rate": 4.833765094741035e-06, + "loss": 0.6616, + "step": 1985 + }, + { + "epoch": 0.7332020674378538, + "grad_norm": 0.8632827997207642, + "learning_rate": 4.833590655181106e-06, + "loss": 0.6911, + "step": 1986 + }, + { + "epoch": 0.73357125276889, + "grad_norm": 0.869616687297821, + "learning_rate": 4.833416127295507e-06, + "loss": 0.7216, + "step": 1987 + }, + { + "epoch": 0.7339404380999262, + "grad_norm": 0.8821524977684021, + "learning_rate": 4.833241511090845e-06, + "loss": 0.7398, + "step": 1988 + }, + { + "epoch": 0.7343096234309623, + "grad_norm": 0.9026939272880554, + "learning_rate": 4.833066806573727e-06, + "loss": 0.6981, + "step": 1989 + }, + { + "epoch": 0.7346788087619985, + "grad_norm": 0.9178372621536255, + "learning_rate": 4.8328920137507665e-06, + "loss": 0.7219, + "step": 1990 + }, + { + "epoch": 0.7350479940930347, + "grad_norm": 0.8885320425033569, + "learning_rate": 4.832717132628581e-06, + "loss": 0.7093, + "step": 1991 + }, + { + "epoch": 0.7354171794240709, + "grad_norm": 0.8704083561897278, + "learning_rate": 4.832542163213787e-06, + "loss": 0.7475, + "step": 1992 + }, + { + "epoch": 0.7357863647551071, + "grad_norm": 0.8328550457954407, + "learning_rate": 4.832367105513008e-06, + "loss": 0.6655, + "step": 1993 + }, + { + "epoch": 0.7361555500861432, + "grad_norm": 0.900768518447876, + "learning_rate": 4.832191959532871e-06, + "loss": 0.6967, + "step": 1994 + }, + { + "epoch": 0.7365247354171794, + "grad_norm": 0.8467245697975159, + "learning_rate": 4.832016725280005e-06, + "loss": 0.6984, + "step": 1995 + }, + { + "epoch": 0.7368939207482156, + "grad_norm": 0.8155560493469238, + "learning_rate": 4.83184140276104e-06, + "loss": 0.6468, + "step": 1996 + }, + { + "epoch": 0.7372631060792518, + "grad_norm": 0.8829312920570374, + "learning_rate": 4.831665991982615e-06, + "loss": 0.7444, + "step": 1997 + }, + { + "epoch": 0.737632291410288, + "grad_norm": 0.8596073389053345, + "learning_rate": 4.831490492951368e-06, + "loss": 0.686, + "step": 1998 + }, + { + "epoch": 0.7380014767413241, + "grad_norm": 0.895241916179657, + "learning_rate": 4.831314905673942e-06, + "loss": 0.6905, + "step": 1999 + }, + { + "epoch": 0.7383706620723604, + "grad_norm": 0.8932267427444458, + "learning_rate": 4.831139230156982e-06, + "loss": 0.6809, + "step": 2000 + }, + { + "epoch": 0.7387398474033965, + "grad_norm": 0.8914393186569214, + "learning_rate": 4.8309634664071385e-06, + "loss": 0.6851, + "step": 2001 + }, + { + "epoch": 0.7391090327344327, + "grad_norm": 0.8881421685218811, + "learning_rate": 4.830787614431062e-06, + "loss": 0.7322, + "step": 2002 + }, + { + "epoch": 0.7394782180654689, + "grad_norm": 0.8394930958747864, + "learning_rate": 4.830611674235411e-06, + "loss": 0.6989, + "step": 2003 + }, + { + "epoch": 0.739847403396505, + "grad_norm": 0.8811700940132141, + "learning_rate": 4.830435645826844e-06, + "loss": 0.7009, + "step": 2004 + }, + { + "epoch": 0.7402165887275413, + "grad_norm": 0.8765476942062378, + "learning_rate": 4.830259529212023e-06, + "loss": 0.7325, + "step": 2005 + }, + { + "epoch": 0.7405857740585774, + "grad_norm": 0.8366778492927551, + "learning_rate": 4.830083324397614e-06, + "loss": 0.6912, + "step": 2006 + }, + { + "epoch": 0.7409549593896135, + "grad_norm": 0.8748518228530884, + "learning_rate": 4.829907031390287e-06, + "loss": 0.7269, + "step": 2007 + }, + { + "epoch": 0.7413241447206498, + "grad_norm": 0.93858802318573, + "learning_rate": 4.829730650196714e-06, + "loss": 0.7176, + "step": 2008 + }, + { + "epoch": 0.7416933300516859, + "grad_norm": 0.8684030771255493, + "learning_rate": 4.8295541808235715e-06, + "loss": 0.7018, + "step": 2009 + }, + { + "epoch": 0.7420625153827222, + "grad_norm": 0.8710829019546509, + "learning_rate": 4.829377623277538e-06, + "loss": 0.7137, + "step": 2010 + }, + { + "epoch": 0.7424317007137583, + "grad_norm": 0.8900696039199829, + "learning_rate": 4.829200977565296e-06, + "loss": 0.695, + "step": 2011 + }, + { + "epoch": 0.7428008860447944, + "grad_norm": 0.8644715547561646, + "learning_rate": 4.8290242436935334e-06, + "loss": 0.6685, + "step": 2012 + }, + { + "epoch": 0.7431700713758307, + "grad_norm": 0.8664246797561646, + "learning_rate": 4.828847421668938e-06, + "loss": 0.6827, + "step": 2013 + }, + { + "epoch": 0.7435392567068668, + "grad_norm": 0.9115155935287476, + "learning_rate": 4.828670511498202e-06, + "loss": 0.7093, + "step": 2014 + }, + { + "epoch": 0.7439084420379031, + "grad_norm": 0.9124789834022522, + "learning_rate": 4.828493513188022e-06, + "loss": 0.7339, + "step": 2015 + }, + { + "epoch": 0.7442776273689392, + "grad_norm": 0.8920673131942749, + "learning_rate": 4.828316426745098e-06, + "loss": 0.7109, + "step": 2016 + }, + { + "epoch": 0.7446468126999753, + "grad_norm": 0.9076531529426575, + "learning_rate": 4.828139252176131e-06, + "loss": 0.7117, + "step": 2017 + }, + { + "epoch": 0.7450159980310116, + "grad_norm": 0.8628717660903931, + "learning_rate": 4.82796198948783e-06, + "loss": 0.7317, + "step": 2018 + }, + { + "epoch": 0.7453851833620477, + "grad_norm": 0.8688773512840271, + "learning_rate": 4.827784638686901e-06, + "loss": 0.7004, + "step": 2019 + }, + { + "epoch": 0.745754368693084, + "grad_norm": 0.8732299208641052, + "learning_rate": 4.827607199780059e-06, + "loss": 0.7333, + "step": 2020 + }, + { + "epoch": 0.7461235540241201, + "grad_norm": 0.8918249011039734, + "learning_rate": 4.8274296727740185e-06, + "loss": 0.6741, + "step": 2021 + }, + { + "epoch": 0.7464927393551563, + "grad_norm": 0.8658198714256287, + "learning_rate": 4.827252057675499e-06, + "loss": 0.7014, + "step": 2022 + }, + { + "epoch": 0.7468619246861925, + "grad_norm": 0.9010634422302246, + "learning_rate": 4.827074354491225e-06, + "loss": 0.7067, + "step": 2023 + }, + { + "epoch": 0.7472311100172286, + "grad_norm": 0.8897079825401306, + "learning_rate": 4.8268965632279194e-06, + "loss": 0.7272, + "step": 2024 + }, + { + "epoch": 0.7476002953482648, + "grad_norm": 0.8660743236541748, + "learning_rate": 4.8267186838923145e-06, + "loss": 0.7262, + "step": 2025 + }, + { + "epoch": 0.747969480679301, + "grad_norm": 0.8774269819259644, + "learning_rate": 4.826540716491141e-06, + "loss": 0.7368, + "step": 2026 + }, + { + "epoch": 0.7483386660103372, + "grad_norm": 0.8606297969818115, + "learning_rate": 4.826362661031136e-06, + "loss": 0.7107, + "step": 2027 + }, + { + "epoch": 0.7487078513413734, + "grad_norm": 0.8835901618003845, + "learning_rate": 4.826184517519038e-06, + "loss": 0.7234, + "step": 2028 + }, + { + "epoch": 0.7490770366724095, + "grad_norm": 0.8923384547233582, + "learning_rate": 4.8260062859615915e-06, + "loss": 0.68, + "step": 2029 + }, + { + "epoch": 0.7494462220034457, + "grad_norm": 0.8915189504623413, + "learning_rate": 4.825827966365541e-06, + "loss": 0.6991, + "step": 2030 + }, + { + "epoch": 0.7498154073344819, + "grad_norm": 0.8923548460006714, + "learning_rate": 4.825649558737635e-06, + "loss": 0.75, + "step": 2031 + }, + { + "epoch": 0.7501845926655181, + "grad_norm": 0.8493019342422485, + "learning_rate": 4.825471063084627e-06, + "loss": 0.6659, + "step": 2032 + }, + { + "epoch": 0.7505537779965543, + "grad_norm": 0.8680519461631775, + "learning_rate": 4.825292479413274e-06, + "loss": 0.6889, + "step": 2033 + }, + { + "epoch": 0.7509229633275905, + "grad_norm": 0.9089414477348328, + "learning_rate": 4.825113807730334e-06, + "loss": 0.7363, + "step": 2034 + }, + { + "epoch": 0.7512921486586266, + "grad_norm": 0.8787106275558472, + "learning_rate": 4.8249350480425704e-06, + "loss": 0.6808, + "step": 2035 + }, + { + "epoch": 0.7516613339896628, + "grad_norm": 0.8691989779472351, + "learning_rate": 4.8247562003567486e-06, + "loss": 0.6974, + "step": 2036 + }, + { + "epoch": 0.752030519320699, + "grad_norm": 0.8437734246253967, + "learning_rate": 4.824577264679639e-06, + "loss": 0.6861, + "step": 2037 + }, + { + "epoch": 0.7523997046517352, + "grad_norm": 0.8832452893257141, + "learning_rate": 4.824398241018014e-06, + "loss": 0.7166, + "step": 2038 + }, + { + "epoch": 0.7527688899827714, + "grad_norm": 0.8944876194000244, + "learning_rate": 4.824219129378648e-06, + "loss": 0.7272, + "step": 2039 + }, + { + "epoch": 0.7531380753138075, + "grad_norm": 0.8749659657478333, + "learning_rate": 4.824039929768322e-06, + "loss": 0.7176, + "step": 2040 + }, + { + "epoch": 0.7535072606448437, + "grad_norm": 0.8656901717185974, + "learning_rate": 4.823860642193818e-06, + "loss": 0.7335, + "step": 2041 + }, + { + "epoch": 0.7538764459758799, + "grad_norm": 0.874985933303833, + "learning_rate": 4.823681266661923e-06, + "loss": 0.6942, + "step": 2042 + }, + { + "epoch": 0.754245631306916, + "grad_norm": 0.8945735692977905, + "learning_rate": 4.823501803179424e-06, + "loss": 0.733, + "step": 2043 + }, + { + "epoch": 0.7546148166379523, + "grad_norm": 0.8628880977630615, + "learning_rate": 4.823322251753115e-06, + "loss": 0.7065, + "step": 2044 + }, + { + "epoch": 0.7549840019689884, + "grad_norm": 0.850796639919281, + "learning_rate": 4.823142612389793e-06, + "loss": 0.6986, + "step": 2045 + }, + { + "epoch": 0.7553531873000247, + "grad_norm": 0.8502240180969238, + "learning_rate": 4.822962885096256e-06, + "loss": 0.7364, + "step": 2046 + }, + { + "epoch": 0.7557223726310608, + "grad_norm": 0.8640246987342834, + "learning_rate": 4.8227830698793076e-06, + "loss": 0.7207, + "step": 2047 + }, + { + "epoch": 0.7560915579620969, + "grad_norm": 0.8806377649307251, + "learning_rate": 4.8226031667457516e-06, + "loss": 0.7591, + "step": 2048 + }, + { + "epoch": 0.7564607432931332, + "grad_norm": 0.9259816408157349, + "learning_rate": 4.8224231757024e-06, + "loss": 0.7162, + "step": 2049 + }, + { + "epoch": 0.7568299286241693, + "grad_norm": 0.8613938689231873, + "learning_rate": 4.822243096756064e-06, + "loss": 0.7024, + "step": 2050 + }, + { + "epoch": 0.7571991139552056, + "grad_norm": 0.8919888734817505, + "learning_rate": 4.822062929913559e-06, + "loss": 0.7056, + "step": 2051 + }, + { + "epoch": 0.7575682992862417, + "grad_norm": 0.8805553913116455, + "learning_rate": 4.821882675181706e-06, + "loss": 0.7197, + "step": 2052 + }, + { + "epoch": 0.7579374846172778, + "grad_norm": 0.8903268575668335, + "learning_rate": 4.821702332567326e-06, + "loss": 0.7158, + "step": 2053 + }, + { + "epoch": 0.7583066699483141, + "grad_norm": 0.8912745118141174, + "learning_rate": 4.8215219020772455e-06, + "loss": 0.6996, + "step": 2054 + }, + { + "epoch": 0.7586758552793502, + "grad_norm": 0.8754813075065613, + "learning_rate": 4.821341383718293e-06, + "loss": 0.7052, + "step": 2055 + }, + { + "epoch": 0.7590450406103865, + "grad_norm": 0.8683738708496094, + "learning_rate": 4.821160777497303e-06, + "loss": 0.7288, + "step": 2056 + }, + { + "epoch": 0.7594142259414226, + "grad_norm": 0.8737871646881104, + "learning_rate": 4.820980083421109e-06, + "loss": 0.7289, + "step": 2057 + }, + { + "epoch": 0.7597834112724587, + "grad_norm": 0.8430957794189453, + "learning_rate": 4.820799301496552e-06, + "loss": 0.6643, + "step": 2058 + }, + { + "epoch": 0.760152596603495, + "grad_norm": 0.8619078397750854, + "learning_rate": 4.820618431730474e-06, + "loss": 0.6849, + "step": 2059 + }, + { + "epoch": 0.7605217819345311, + "grad_norm": 0.8729053139686584, + "learning_rate": 4.820437474129721e-06, + "loss": 0.7081, + "step": 2060 + }, + { + "epoch": 0.7608909672655673, + "grad_norm": 0.8700425028800964, + "learning_rate": 4.820256428701141e-06, + "loss": 0.6952, + "step": 2061 + }, + { + "epoch": 0.7612601525966035, + "grad_norm": 0.8315994143486023, + "learning_rate": 4.8200752954515885e-06, + "loss": 0.6861, + "step": 2062 + }, + { + "epoch": 0.7616293379276396, + "grad_norm": 0.8591640591621399, + "learning_rate": 4.819894074387917e-06, + "loss": 0.7155, + "step": 2063 + }, + { + "epoch": 0.7619985232586759, + "grad_norm": 0.9315993189811707, + "learning_rate": 4.8197127655169885e-06, + "loss": 0.7121, + "step": 2064 + }, + { + "epoch": 0.762367708589712, + "grad_norm": 0.8699647188186646, + "learning_rate": 4.819531368845662e-06, + "loss": 0.7095, + "step": 2065 + }, + { + "epoch": 0.7627368939207482, + "grad_norm": 0.8588031530380249, + "learning_rate": 4.819349884380807e-06, + "loss": 0.6605, + "step": 2066 + }, + { + "epoch": 0.7631060792517844, + "grad_norm": 0.898235559463501, + "learning_rate": 4.81916831212929e-06, + "loss": 0.7332, + "step": 2067 + }, + { + "epoch": 0.7634752645828206, + "grad_norm": 0.8969528079032898, + "learning_rate": 4.818986652097985e-06, + "loss": 0.7423, + "step": 2068 + }, + { + "epoch": 0.7638444499138568, + "grad_norm": 0.8572843074798584, + "learning_rate": 4.818804904293767e-06, + "loss": 0.7698, + "step": 2069 + }, + { + "epoch": 0.7642136352448929, + "grad_norm": 0.8814283609390259, + "learning_rate": 4.8186230687235145e-06, + "loss": 0.709, + "step": 2070 + }, + { + "epoch": 0.7645828205759291, + "grad_norm": 0.9045025110244751, + "learning_rate": 4.818441145394111e-06, + "loss": 0.7065, + "step": 2071 + }, + { + "epoch": 0.7649520059069653, + "grad_norm": 0.8909565210342407, + "learning_rate": 4.818259134312442e-06, + "loss": 0.6989, + "step": 2072 + }, + { + "epoch": 0.7653211912380015, + "grad_norm": 0.874191164970398, + "learning_rate": 4.818077035485396e-06, + "loss": 0.6966, + "step": 2073 + }, + { + "epoch": 0.7656903765690377, + "grad_norm": 0.8939563035964966, + "learning_rate": 4.817894848919866e-06, + "loss": 0.7279, + "step": 2074 + }, + { + "epoch": 0.7660595619000738, + "grad_norm": 0.8762744665145874, + "learning_rate": 4.817712574622748e-06, + "loss": 0.7097, + "step": 2075 + }, + { + "epoch": 0.76642874723111, + "grad_norm": 0.9104212522506714, + "learning_rate": 4.81753021260094e-06, + "loss": 0.7419, + "step": 2076 + }, + { + "epoch": 0.7667979325621462, + "grad_norm": 0.8775395154953003, + "learning_rate": 4.817347762861345e-06, + "loss": 0.6853, + "step": 2077 + }, + { + "epoch": 0.7671671178931824, + "grad_norm": 0.9023780822753906, + "learning_rate": 4.817165225410868e-06, + "loss": 0.7256, + "step": 2078 + }, + { + "epoch": 0.7675363032242185, + "grad_norm": 0.8750520944595337, + "learning_rate": 4.816982600256419e-06, + "loss": 0.6715, + "step": 2079 + }, + { + "epoch": 0.7679054885552548, + "grad_norm": 0.8693894743919373, + "learning_rate": 4.816799887404911e-06, + "loss": 0.694, + "step": 2080 + }, + { + "epoch": 0.7682746738862909, + "grad_norm": 0.8720349073410034, + "learning_rate": 4.816617086863256e-06, + "loss": 0.7284, + "step": 2081 + }, + { + "epoch": 0.7686438592173271, + "grad_norm": 0.8731261491775513, + "learning_rate": 4.816434198638378e-06, + "loss": 0.7152, + "step": 2082 + }, + { + "epoch": 0.7690130445483633, + "grad_norm": 0.8482458591461182, + "learning_rate": 4.816251222737195e-06, + "loss": 0.6944, + "step": 2083 + }, + { + "epoch": 0.7693822298793994, + "grad_norm": 0.9315195083618164, + "learning_rate": 4.816068159166635e-06, + "loss": 0.6859, + "step": 2084 + }, + { + "epoch": 0.7697514152104357, + "grad_norm": 0.8510304689407349, + "learning_rate": 4.815885007933625e-06, + "loss": 0.6863, + "step": 2085 + }, + { + "epoch": 0.7701206005414718, + "grad_norm": 0.8732700347900391, + "learning_rate": 4.815701769045099e-06, + "loss": 0.7242, + "step": 2086 + }, + { + "epoch": 0.770489785872508, + "grad_norm": 0.8937168121337891, + "learning_rate": 4.815518442507992e-06, + "loss": 0.7239, + "step": 2087 + }, + { + "epoch": 0.7708589712035442, + "grad_norm": 0.8584597706794739, + "learning_rate": 4.815335028329243e-06, + "loss": 0.7197, + "step": 2088 + }, + { + "epoch": 0.7712281565345803, + "grad_norm": 0.8576000928878784, + "learning_rate": 4.815151526515794e-06, + "loss": 0.6708, + "step": 2089 + }, + { + "epoch": 0.7715973418656166, + "grad_norm": 0.9142547845840454, + "learning_rate": 4.814967937074589e-06, + "loss": 0.7148, + "step": 2090 + }, + { + "epoch": 0.7719665271966527, + "grad_norm": 0.8664703369140625, + "learning_rate": 4.81478426001258e-06, + "loss": 0.7159, + "step": 2091 + }, + { + "epoch": 0.7723357125276888, + "grad_norm": 0.8845775723457336, + "learning_rate": 4.814600495336716e-06, + "loss": 0.7031, + "step": 2092 + }, + { + "epoch": 0.7727048978587251, + "grad_norm": 0.8626143932342529, + "learning_rate": 4.8144166430539555e-06, + "loss": 0.6828, + "step": 2093 + }, + { + "epoch": 0.7730740831897612, + "grad_norm": 0.8681825399398804, + "learning_rate": 4.814232703171254e-06, + "loss": 0.7169, + "step": 2094 + }, + { + "epoch": 0.7734432685207975, + "grad_norm": 0.9187625050544739, + "learning_rate": 4.8140486756955755e-06, + "loss": 0.7019, + "step": 2095 + }, + { + "epoch": 0.7738124538518336, + "grad_norm": 0.8887724876403809, + "learning_rate": 4.813864560633885e-06, + "loss": 0.7349, + "step": 2096 + }, + { + "epoch": 0.7741816391828698, + "grad_norm": 0.8629727959632874, + "learning_rate": 4.81368035799315e-06, + "loss": 0.7328, + "step": 2097 + }, + { + "epoch": 0.774550824513906, + "grad_norm": 0.8609279990196228, + "learning_rate": 4.813496067780345e-06, + "loss": 0.6909, + "step": 2098 + }, + { + "epoch": 0.7749200098449421, + "grad_norm": 0.8608946800231934, + "learning_rate": 4.813311690002444e-06, + "loss": 0.6885, + "step": 2099 + }, + { + "epoch": 0.7752891951759784, + "grad_norm": 0.8858616352081299, + "learning_rate": 4.813127224666425e-06, + "loss": 0.6919, + "step": 2100 + }, + { + "epoch": 0.7756583805070145, + "grad_norm": 0.8694943785667419, + "learning_rate": 4.812942671779271e-06, + "loss": 0.6736, + "step": 2101 + }, + { + "epoch": 0.7760275658380507, + "grad_norm": 0.8370912671089172, + "learning_rate": 4.812758031347967e-06, + "loss": 0.6608, + "step": 2102 + }, + { + "epoch": 0.7763967511690869, + "grad_norm": 0.8685939908027649, + "learning_rate": 4.812573303379501e-06, + "loss": 0.74, + "step": 2103 + }, + { + "epoch": 0.776765936500123, + "grad_norm": 0.8643637299537659, + "learning_rate": 4.812388487880865e-06, + "loss": 0.6988, + "step": 2104 + }, + { + "epoch": 0.7771351218311593, + "grad_norm": 0.9060371518135071, + "learning_rate": 4.8122035848590555e-06, + "loss": 0.731, + "step": 2105 + }, + { + "epoch": 0.7775043071621954, + "grad_norm": 0.8900934457778931, + "learning_rate": 4.81201859432107e-06, + "loss": 0.7363, + "step": 2106 + }, + { + "epoch": 0.7778734924932316, + "grad_norm": 0.8641214966773987, + "learning_rate": 4.8118335162739096e-06, + "loss": 0.6838, + "step": 2107 + }, + { + "epoch": 0.7782426778242678, + "grad_norm": 0.926879346370697, + "learning_rate": 4.81164835072458e-06, + "loss": 0.7187, + "step": 2108 + }, + { + "epoch": 0.778611863155304, + "grad_norm": 0.8715068697929382, + "learning_rate": 4.81146309768009e-06, + "loss": 0.7339, + "step": 2109 + }, + { + "epoch": 0.7789810484863401, + "grad_norm": 0.9065748453140259, + "learning_rate": 4.811277757147452e-06, + "loss": 0.7526, + "step": 2110 + }, + { + "epoch": 0.7793502338173763, + "grad_norm": 0.8723341822624207, + "learning_rate": 4.8110923291336804e-06, + "loss": 0.7001, + "step": 2111 + }, + { + "epoch": 0.7797194191484125, + "grad_norm": 0.8835813999176025, + "learning_rate": 4.810906813645793e-06, + "loss": 0.7187, + "step": 2112 + }, + { + "epoch": 0.7800886044794487, + "grad_norm": 0.8894395232200623, + "learning_rate": 4.810721210690812e-06, + "loss": 0.7349, + "step": 2113 + }, + { + "epoch": 0.7804577898104849, + "grad_norm": 0.8809918761253357, + "learning_rate": 4.8105355202757635e-06, + "loss": 0.6857, + "step": 2114 + }, + { + "epoch": 0.780826975141521, + "grad_norm": 0.8665323257446289, + "learning_rate": 4.810349742407673e-06, + "loss": 0.724, + "step": 2115 + }, + { + "epoch": 0.7811961604725572, + "grad_norm": 0.9003410935401917, + "learning_rate": 4.810163877093575e-06, + "loss": 0.6776, + "step": 2116 + }, + { + "epoch": 0.7815653458035934, + "grad_norm": 0.9015949368476868, + "learning_rate": 4.8099779243405035e-06, + "loss": 0.7107, + "step": 2117 + }, + { + "epoch": 0.7819345311346296, + "grad_norm": 0.9056869149208069, + "learning_rate": 4.8097918841554965e-06, + "loss": 0.7009, + "step": 2118 + }, + { + "epoch": 0.7823037164656658, + "grad_norm": 0.8645703792572021, + "learning_rate": 4.809605756545596e-06, + "loss": 0.7285, + "step": 2119 + }, + { + "epoch": 0.7826729017967019, + "grad_norm": 0.9308361411094666, + "learning_rate": 4.809419541517845e-06, + "loss": 0.7097, + "step": 2120 + }, + { + "epoch": 0.7830420871277382, + "grad_norm": 0.8967791795730591, + "learning_rate": 4.809233239079295e-06, + "loss": 0.7163, + "step": 2121 + }, + { + "epoch": 0.7834112724587743, + "grad_norm": 0.868552565574646, + "learning_rate": 4.809046849236995e-06, + "loss": 0.7397, + "step": 2122 + }, + { + "epoch": 0.7837804577898105, + "grad_norm": 0.894079864025116, + "learning_rate": 4.808860371998e-06, + "loss": 0.7136, + "step": 2123 + }, + { + "epoch": 0.7841496431208467, + "grad_norm": 0.874018132686615, + "learning_rate": 4.808673807369369e-06, + "loss": 0.6998, + "step": 2124 + }, + { + "epoch": 0.7845188284518828, + "grad_norm": 0.8726391792297363, + "learning_rate": 4.808487155358163e-06, + "loss": 0.7335, + "step": 2125 + }, + { + "epoch": 0.7848880137829191, + "grad_norm": 0.8656647205352783, + "learning_rate": 4.8083004159714465e-06, + "loss": 0.6756, + "step": 2126 + }, + { + "epoch": 0.7852571991139552, + "grad_norm": 0.8752986788749695, + "learning_rate": 4.808113589216288e-06, + "loss": 0.6846, + "step": 2127 + }, + { + "epoch": 0.7856263844449913, + "grad_norm": 0.8898423910140991, + "learning_rate": 4.807926675099759e-06, + "loss": 0.7056, + "step": 2128 + }, + { + "epoch": 0.7859955697760276, + "grad_norm": 0.9080005288124084, + "learning_rate": 4.807739673628934e-06, + "loss": 0.6836, + "step": 2129 + }, + { + "epoch": 0.7863647551070637, + "grad_norm": 0.9017002582550049, + "learning_rate": 4.8075525848108895e-06, + "loss": 0.7043, + "step": 2130 + }, + { + "epoch": 0.7867339404381, + "grad_norm": 0.9311098456382751, + "learning_rate": 4.807365408652709e-06, + "loss": 0.7493, + "step": 2131 + }, + { + "epoch": 0.7871031257691361, + "grad_norm": 1.0424463748931885, + "learning_rate": 4.807178145161475e-06, + "loss": 0.708, + "step": 2132 + }, + { + "epoch": 0.7874723111001722, + "grad_norm": 0.8803859949111938, + "learning_rate": 4.8069907943442775e-06, + "loss": 0.7011, + "step": 2133 + }, + { + "epoch": 0.7878414964312085, + "grad_norm": 0.8854016661643982, + "learning_rate": 4.806803356208205e-06, + "loss": 0.7101, + "step": 2134 + }, + { + "epoch": 0.7882106817622446, + "grad_norm": 0.8736090064048767, + "learning_rate": 4.806615830760355e-06, + "loss": 0.7102, + "step": 2135 + }, + { + "epoch": 0.7885798670932809, + "grad_norm": 0.8740845918655396, + "learning_rate": 4.806428218007823e-06, + "loss": 0.691, + "step": 2136 + }, + { + "epoch": 0.788949052424317, + "grad_norm": 0.9291470050811768, + "learning_rate": 4.806240517957711e-06, + "loss": 0.6972, + "step": 2137 + }, + { + "epoch": 0.7893182377553531, + "grad_norm": 0.8928626775741577, + "learning_rate": 4.8060527306171235e-06, + "loss": 0.6803, + "step": 2138 + }, + { + "epoch": 0.7896874230863894, + "grad_norm": 0.90341717004776, + "learning_rate": 4.805864855993167e-06, + "loss": 0.7155, + "step": 2139 + }, + { + "epoch": 0.7900566084174255, + "grad_norm": 0.8977493047714233, + "learning_rate": 4.805676894092956e-06, + "loss": 0.7205, + "step": 2140 + }, + { + "epoch": 0.7904257937484618, + "grad_norm": 0.8815012574195862, + "learning_rate": 4.805488844923599e-06, + "loss": 0.7137, + "step": 2141 + }, + { + "epoch": 0.7907949790794979, + "grad_norm": 0.8667290806770325, + "learning_rate": 4.8053007084922185e-06, + "loss": 0.7219, + "step": 2142 + }, + { + "epoch": 0.791164164410534, + "grad_norm": 0.8803372979164124, + "learning_rate": 4.805112484805934e-06, + "loss": 0.737, + "step": 2143 + }, + { + "epoch": 0.7915333497415703, + "grad_norm": 0.9153837561607361, + "learning_rate": 4.804924173871869e-06, + "loss": 0.7093, + "step": 2144 + }, + { + "epoch": 0.7919025350726064, + "grad_norm": 0.875203013420105, + "learning_rate": 4.804735775697151e-06, + "loss": 0.7147, + "step": 2145 + }, + { + "epoch": 0.7922717204036426, + "grad_norm": 0.8866360187530518, + "learning_rate": 4.8045472902889125e-06, + "loss": 0.7184, + "step": 2146 + }, + { + "epoch": 0.7926409057346788, + "grad_norm": 0.8125059604644775, + "learning_rate": 4.804358717654286e-06, + "loss": 0.6916, + "step": 2147 + }, + { + "epoch": 0.793010091065715, + "grad_norm": 0.8981543183326721, + "learning_rate": 4.804170057800408e-06, + "loss": 0.7109, + "step": 2148 + }, + { + "epoch": 0.7933792763967512, + "grad_norm": 0.841891348361969, + "learning_rate": 4.803981310734422e-06, + "loss": 0.6623, + "step": 2149 + }, + { + "epoch": 0.7937484617277873, + "grad_norm": 0.9186368584632874, + "learning_rate": 4.80379247646347e-06, + "loss": 0.7312, + "step": 2150 + }, + { + "epoch": 0.7941176470588235, + "grad_norm": 0.8802003860473633, + "learning_rate": 4.8036035549947e-06, + "loss": 0.7202, + "step": 2151 + }, + { + "epoch": 0.7944868323898597, + "grad_norm": 0.8542637825012207, + "learning_rate": 4.803414546335262e-06, + "loss": 0.6933, + "step": 2152 + }, + { + "epoch": 0.7948560177208959, + "grad_norm": 0.8632619976997375, + "learning_rate": 4.803225450492311e-06, + "loss": 0.7034, + "step": 2153 + }, + { + "epoch": 0.7952252030519321, + "grad_norm": 0.9003282785415649, + "learning_rate": 4.803036267473003e-06, + "loss": 0.707, + "step": 2154 + }, + { + "epoch": 0.7955943883829683, + "grad_norm": 0.8903799057006836, + "learning_rate": 4.802846997284499e-06, + "loss": 0.7226, + "step": 2155 + }, + { + "epoch": 0.7959635737140044, + "grad_norm": 0.8972636461257935, + "learning_rate": 4.802657639933964e-06, + "loss": 0.7459, + "step": 2156 + }, + { + "epoch": 0.7963327590450406, + "grad_norm": 0.8760960102081299, + "learning_rate": 4.802468195428563e-06, + "loss": 0.6983, + "step": 2157 + }, + { + "epoch": 0.7967019443760768, + "grad_norm": 0.8755945563316345, + "learning_rate": 4.802278663775468e-06, + "loss": 0.6942, + "step": 2158 + }, + { + "epoch": 0.797071129707113, + "grad_norm": 0.8775022625923157, + "learning_rate": 4.8020890449818524e-06, + "loss": 0.7312, + "step": 2159 + }, + { + "epoch": 0.7974403150381492, + "grad_norm": 0.8853294253349304, + "learning_rate": 4.801899339054893e-06, + "loss": 0.7129, + "step": 2160 + }, + { + "epoch": 0.7978095003691853, + "grad_norm": 0.8880541920661926, + "learning_rate": 4.801709546001769e-06, + "loss": 0.6857, + "step": 2161 + }, + { + "epoch": 0.7981786857002215, + "grad_norm": 0.9117274880409241, + "learning_rate": 4.801519665829666e-06, + "loss": 0.7093, + "step": 2162 + }, + { + "epoch": 0.7985478710312577, + "grad_norm": 0.8845729827880859, + "learning_rate": 4.8013296985457705e-06, + "loss": 0.7214, + "step": 2163 + }, + { + "epoch": 0.7989170563622938, + "grad_norm": 0.8805612325668335, + "learning_rate": 4.801139644157272e-06, + "loss": 0.6788, + "step": 2164 + }, + { + "epoch": 0.7992862416933301, + "grad_norm": 0.8353918790817261, + "learning_rate": 4.800949502671364e-06, + "loss": 0.6798, + "step": 2165 + }, + { + "epoch": 0.7996554270243662, + "grad_norm": 0.8719606995582581, + "learning_rate": 4.800759274095243e-06, + "loss": 0.6992, + "step": 2166 + }, + { + "epoch": 0.8000246123554025, + "grad_norm": 0.9136744141578674, + "learning_rate": 4.800568958436111e-06, + "loss": 0.7132, + "step": 2167 + }, + { + "epoch": 0.8003937976864386, + "grad_norm": 0.8668189644813538, + "learning_rate": 4.800378555701168e-06, + "loss": 0.686, + "step": 2168 + }, + { + "epoch": 0.8007629830174747, + "grad_norm": 0.8641545176506042, + "learning_rate": 4.800188065897624e-06, + "loss": 0.6975, + "step": 2169 + }, + { + "epoch": 0.801132168348511, + "grad_norm": 0.8470262289047241, + "learning_rate": 4.799997489032687e-06, + "loss": 0.7277, + "step": 2170 + }, + { + "epoch": 0.8015013536795471, + "grad_norm": 0.8653181791305542, + "learning_rate": 4.799806825113571e-06, + "loss": 0.7211, + "step": 2171 + }, + { + "epoch": 0.8018705390105834, + "grad_norm": 0.8891727328300476, + "learning_rate": 4.799616074147493e-06, + "loss": 0.7161, + "step": 2172 + }, + { + "epoch": 0.8022397243416195, + "grad_norm": 0.8755497336387634, + "learning_rate": 4.799425236141672e-06, + "loss": 0.694, + "step": 2173 + }, + { + "epoch": 0.8026089096726556, + "grad_norm": 0.8740184903144836, + "learning_rate": 4.7992343111033314e-06, + "loss": 0.6935, + "step": 2174 + }, + { + "epoch": 0.8029780950036919, + "grad_norm": 0.885025143623352, + "learning_rate": 4.7990432990396985e-06, + "loss": 0.6768, + "step": 2175 + }, + { + "epoch": 0.803347280334728, + "grad_norm": 0.9006462097167969, + "learning_rate": 4.798852199958002e-06, + "loss": 0.7212, + "step": 2176 + }, + { + "epoch": 0.8037164656657643, + "grad_norm": 0.8840904831886292, + "learning_rate": 4.798661013865475e-06, + "loss": 0.7266, + "step": 2177 + }, + { + "epoch": 0.8040856509968004, + "grad_norm": 0.8590693473815918, + "learning_rate": 4.798469740769354e-06, + "loss": 0.6822, + "step": 2178 + }, + { + "epoch": 0.8044548363278365, + "grad_norm": 0.8712232112884521, + "learning_rate": 4.798278380676879e-06, + "loss": 0.724, + "step": 2179 + }, + { + "epoch": 0.8048240216588728, + "grad_norm": 0.9065150618553162, + "learning_rate": 4.798086933595293e-06, + "loss": 0.6891, + "step": 2180 + }, + { + "epoch": 0.8051932069899089, + "grad_norm": 0.8752925395965576, + "learning_rate": 4.797895399531841e-06, + "loss": 0.71, + "step": 2181 + }, + { + "epoch": 0.8055623923209451, + "grad_norm": 0.8934815526008606, + "learning_rate": 4.797703778493774e-06, + "loss": 0.7422, + "step": 2182 + }, + { + "epoch": 0.8059315776519813, + "grad_norm": 0.8814444541931152, + "learning_rate": 4.7975120704883435e-06, + "loss": 0.7572, + "step": 2183 + }, + { + "epoch": 0.8063007629830174, + "grad_norm": 0.9211199879646301, + "learning_rate": 4.797320275522806e-06, + "loss": 0.7112, + "step": 2184 + }, + { + "epoch": 0.8066699483140537, + "grad_norm": 0.8932773470878601, + "learning_rate": 4.7971283936044226e-06, + "loss": 0.7328, + "step": 2185 + }, + { + "epoch": 0.8070391336450898, + "grad_norm": 0.8870787620544434, + "learning_rate": 4.796936424740454e-06, + "loss": 0.7048, + "step": 2186 + }, + { + "epoch": 0.807408318976126, + "grad_norm": 0.8720287680625916, + "learning_rate": 4.796744368938166e-06, + "loss": 0.6703, + "step": 2187 + }, + { + "epoch": 0.8077775043071622, + "grad_norm": 0.8679975271224976, + "learning_rate": 4.79655222620483e-06, + "loss": 0.7078, + "step": 2188 + }, + { + "epoch": 0.8081466896381984, + "grad_norm": 0.8599095940589905, + "learning_rate": 4.796359996547715e-06, + "loss": 0.6839, + "step": 2189 + }, + { + "epoch": 0.8085158749692346, + "grad_norm": 0.8417700529098511, + "learning_rate": 4.7961676799741e-06, + "loss": 0.6757, + "step": 2190 + }, + { + "epoch": 0.8088850603002707, + "grad_norm": 0.8529911637306213, + "learning_rate": 4.795975276491262e-06, + "loss": 0.7033, + "step": 2191 + }, + { + "epoch": 0.8092542456313069, + "grad_norm": 0.8772951364517212, + "learning_rate": 4.7957827861064855e-06, + "loss": 0.6562, + "step": 2192 + }, + { + "epoch": 0.8096234309623431, + "grad_norm": 0.9334204196929932, + "learning_rate": 4.795590208827054e-06, + "loss": 0.7505, + "step": 2193 + }, + { + "epoch": 0.8099926162933793, + "grad_norm": 0.8638662695884705, + "learning_rate": 4.795397544660258e-06, + "loss": 0.6816, + "step": 2194 + }, + { + "epoch": 0.8103618016244155, + "grad_norm": 0.9105616211891174, + "learning_rate": 4.795204793613391e-06, + "loss": 0.714, + "step": 2195 + }, + { + "epoch": 0.8107309869554516, + "grad_norm": 0.8942384719848633, + "learning_rate": 4.7950119556937455e-06, + "loss": 0.7327, + "step": 2196 + }, + { + "epoch": 0.8111001722864878, + "grad_norm": 0.8448975086212158, + "learning_rate": 4.794819030908622e-06, + "loss": 0.6941, + "step": 2197 + }, + { + "epoch": 0.811469357617524, + "grad_norm": 0.912215530872345, + "learning_rate": 4.7946260192653215e-06, + "loss": 0.7216, + "step": 2198 + }, + { + "epoch": 0.8118385429485602, + "grad_norm": 0.8788778185844421, + "learning_rate": 4.794432920771152e-06, + "loss": 0.7399, + "step": 2199 + }, + { + "epoch": 0.8122077282795963, + "grad_norm": 0.8956362009048462, + "learning_rate": 4.794239735433419e-06, + "loss": 0.7282, + "step": 2200 + }, + { + "epoch": 0.8125769136106326, + "grad_norm": 0.9069665670394897, + "learning_rate": 4.7940464632594376e-06, + "loss": 0.7298, + "step": 2201 + }, + { + "epoch": 0.8129460989416687, + "grad_norm": 0.8730249404907227, + "learning_rate": 4.793853104256521e-06, + "loss": 0.7101, + "step": 2202 + }, + { + "epoch": 0.8133152842727049, + "grad_norm": 1.0605006217956543, + "learning_rate": 4.793659658431988e-06, + "loss": 0.7115, + "step": 2203 + }, + { + "epoch": 0.8136844696037411, + "grad_norm": 0.9141219258308411, + "learning_rate": 4.793466125793161e-06, + "loss": 0.7554, + "step": 2204 + }, + { + "epoch": 0.8140536549347772, + "grad_norm": 0.8501049876213074, + "learning_rate": 4.793272506347365e-06, + "loss": 0.6839, + "step": 2205 + }, + { + "epoch": 0.8144228402658135, + "grad_norm": 0.8409802317619324, + "learning_rate": 4.793078800101929e-06, + "loss": 0.6518, + "step": 2206 + }, + { + "epoch": 0.8147920255968496, + "grad_norm": 0.8758774995803833, + "learning_rate": 4.792885007064183e-06, + "loss": 0.7349, + "step": 2207 + }, + { + "epoch": 0.8151612109278858, + "grad_norm": 0.8612200021743774, + "learning_rate": 4.792691127241463e-06, + "loss": 0.6952, + "step": 2208 + }, + { + "epoch": 0.815530396258922, + "grad_norm": 0.8541130423545837, + "learning_rate": 4.792497160641108e-06, + "loss": 0.6723, + "step": 2209 + }, + { + "epoch": 0.8158995815899581, + "grad_norm": 0.884846568107605, + "learning_rate": 4.7923031072704595e-06, + "loss": 0.7165, + "step": 2210 + }, + { + "epoch": 0.8162687669209944, + "grad_norm": 0.8823457956314087, + "learning_rate": 4.792108967136861e-06, + "loss": 0.7599, + "step": 2211 + }, + { + "epoch": 0.8166379522520305, + "grad_norm": 0.8571475744247437, + "learning_rate": 4.791914740247663e-06, + "loss": 0.7122, + "step": 2212 + }, + { + "epoch": 0.8170071375830666, + "grad_norm": 0.8436484336853027, + "learning_rate": 4.791720426610214e-06, + "loss": 0.6781, + "step": 2213 + }, + { + "epoch": 0.8173763229141029, + "grad_norm": 0.8832083344459534, + "learning_rate": 4.791526026231871e-06, + "loss": 0.7202, + "step": 2214 + }, + { + "epoch": 0.817745508245139, + "grad_norm": 0.9136930108070374, + "learning_rate": 4.79133153911999e-06, + "loss": 0.756, + "step": 2215 + }, + { + "epoch": 0.8181146935761753, + "grad_norm": 0.87180095911026, + "learning_rate": 4.791136965281934e-06, + "loss": 0.6874, + "step": 2216 + }, + { + "epoch": 0.8184838789072114, + "grad_norm": 0.8829808831214905, + "learning_rate": 4.790942304725067e-06, + "loss": 0.6974, + "step": 2217 + }, + { + "epoch": 0.8188530642382476, + "grad_norm": 0.8646160960197449, + "learning_rate": 4.790747557456757e-06, + "loss": 0.704, + "step": 2218 + }, + { + "epoch": 0.8192222495692838, + "grad_norm": 0.8445609211921692, + "learning_rate": 4.790552723484375e-06, + "loss": 0.7028, + "step": 2219 + }, + { + "epoch": 0.8195914349003199, + "grad_norm": 0.8569789528846741, + "learning_rate": 4.7903578028152946e-06, + "loss": 0.7304, + "step": 2220 + }, + { + "epoch": 0.8199606202313562, + "grad_norm": 0.8919884562492371, + "learning_rate": 4.790162795456895e-06, + "loss": 0.7303, + "step": 2221 + }, + { + "epoch": 0.8203298055623923, + "grad_norm": 0.8782804012298584, + "learning_rate": 4.789967701416556e-06, + "loss": 0.6676, + "step": 2222 + }, + { + "epoch": 0.8206989908934285, + "grad_norm": 0.8411016464233398, + "learning_rate": 4.789772520701662e-06, + "loss": 0.6856, + "step": 2223 + }, + { + "epoch": 0.8210681762244647, + "grad_norm": 0.8851430416107178, + "learning_rate": 4.7895772533196e-06, + "loss": 0.6995, + "step": 2224 + }, + { + "epoch": 0.8214373615555008, + "grad_norm": 0.8756270408630371, + "learning_rate": 4.789381899277763e-06, + "loss": 0.6872, + "step": 2225 + }, + { + "epoch": 0.8218065468865371, + "grad_norm": 0.8848443031311035, + "learning_rate": 4.7891864585835435e-06, + "loss": 0.7128, + "step": 2226 + }, + { + "epoch": 0.8221757322175732, + "grad_norm": 0.8853998780250549, + "learning_rate": 4.788990931244338e-06, + "loss": 0.7123, + "step": 2227 + }, + { + "epoch": 0.8225449175486094, + "grad_norm": 0.8603769540786743, + "learning_rate": 4.788795317267549e-06, + "loss": 0.7094, + "step": 2228 + }, + { + "epoch": 0.8229141028796456, + "grad_norm": 0.8783960342407227, + "learning_rate": 4.7885996166605795e-06, + "loss": 0.6871, + "step": 2229 + }, + { + "epoch": 0.8232832882106818, + "grad_norm": 0.9068373441696167, + "learning_rate": 4.788403829430837e-06, + "loss": 0.7508, + "step": 2230 + }, + { + "epoch": 0.8236524735417179, + "grad_norm": 0.9134407043457031, + "learning_rate": 4.788207955585732e-06, + "loss": 0.685, + "step": 2231 + }, + { + "epoch": 0.8240216588727541, + "grad_norm": 0.8849846720695496, + "learning_rate": 4.788011995132678e-06, + "loss": 0.6984, + "step": 2232 + }, + { + "epoch": 0.8243908442037903, + "grad_norm": 0.8880661725997925, + "learning_rate": 4.787815948079092e-06, + "loss": 0.6916, + "step": 2233 + }, + { + "epoch": 0.8247600295348265, + "grad_norm": 0.8438676595687866, + "learning_rate": 4.787619814432394e-06, + "loss": 0.6781, + "step": 2234 + }, + { + "epoch": 0.8251292148658627, + "grad_norm": 0.8555399775505066, + "learning_rate": 4.787423594200007e-06, + "loss": 0.7133, + "step": 2235 + }, + { + "epoch": 0.8254984001968988, + "grad_norm": 0.8856319189071655, + "learning_rate": 4.787227287389361e-06, + "loss": 0.7102, + "step": 2236 + }, + { + "epoch": 0.825867585527935, + "grad_norm": 0.8559306859970093, + "learning_rate": 4.787030894007882e-06, + "loss": 0.6828, + "step": 2237 + }, + { + "epoch": 0.8262367708589712, + "grad_norm": 0.8558578491210938, + "learning_rate": 4.7868344140630076e-06, + "loss": 0.692, + "step": 2238 + }, + { + "epoch": 0.8266059561900074, + "grad_norm": 0.8673276901245117, + "learning_rate": 4.786637847562171e-06, + "loss": 0.7351, + "step": 2239 + }, + { + "epoch": 0.8269751415210436, + "grad_norm": 0.88542640209198, + "learning_rate": 4.786441194512814e-06, + "loss": 0.7116, + "step": 2240 + }, + { + "epoch": 0.8273443268520797, + "grad_norm": 0.8700323700904846, + "learning_rate": 4.786244454922379e-06, + "loss": 0.707, + "step": 2241 + }, + { + "epoch": 0.827713512183116, + "grad_norm": 0.8809252977371216, + "learning_rate": 4.7860476287983124e-06, + "loss": 0.715, + "step": 2242 + }, + { + "epoch": 0.8280826975141521, + "grad_norm": 0.9027058482170105, + "learning_rate": 4.785850716148066e-06, + "loss": 0.7354, + "step": 2243 + }, + { + "epoch": 0.8284518828451883, + "grad_norm": 0.8742669224739075, + "learning_rate": 4.7856537169790905e-06, + "loss": 0.6748, + "step": 2244 + }, + { + "epoch": 0.8288210681762245, + "grad_norm": 0.8710356950759888, + "learning_rate": 4.7854566312988435e-06, + "loss": 0.7133, + "step": 2245 + }, + { + "epoch": 0.8291902535072606, + "grad_norm": 0.9114298224449158, + "learning_rate": 4.785259459114784e-06, + "loss": 0.7176, + "step": 2246 + }, + { + "epoch": 0.8295594388382969, + "grad_norm": 0.8762310147285461, + "learning_rate": 4.785062200434376e-06, + "loss": 0.7265, + "step": 2247 + }, + { + "epoch": 0.829928624169333, + "grad_norm": 0.8757422566413879, + "learning_rate": 4.784864855265083e-06, + "loss": 0.6755, + "step": 2248 + }, + { + "epoch": 0.8302978095003691, + "grad_norm": 0.8273042440414429, + "learning_rate": 4.784667423614379e-06, + "loss": 0.6903, + "step": 2249 + }, + { + "epoch": 0.8306669948314054, + "grad_norm": 0.911002516746521, + "learning_rate": 4.784469905489732e-06, + "loss": 0.7196, + "step": 2250 + }, + { + "epoch": 0.8310361801624415, + "grad_norm": 0.8729998469352722, + "learning_rate": 4.784272300898621e-06, + "loss": 0.727, + "step": 2251 + }, + { + "epoch": 0.8314053654934778, + "grad_norm": 0.897255003452301, + "learning_rate": 4.784074609848524e-06, + "loss": 0.7145, + "step": 2252 + }, + { + "epoch": 0.8317745508245139, + "grad_norm": 0.8995161056518555, + "learning_rate": 4.783876832346924e-06, + "loss": 0.6983, + "step": 2253 + }, + { + "epoch": 0.83214373615555, + "grad_norm": 0.8717133402824402, + "learning_rate": 4.783678968401306e-06, + "loss": 0.7164, + "step": 2254 + }, + { + "epoch": 0.8325129214865863, + "grad_norm": 0.8860628604888916, + "learning_rate": 4.783481018019161e-06, + "loss": 0.6891, + "step": 2255 + }, + { + "epoch": 0.8328821068176224, + "grad_norm": 0.8644296526908875, + "learning_rate": 4.783282981207979e-06, + "loss": 0.7449, + "step": 2256 + }, + { + "epoch": 0.8332512921486587, + "grad_norm": 0.8703963160514832, + "learning_rate": 4.783084857975258e-06, + "loss": 0.7067, + "step": 2257 + }, + { + "epoch": 0.8336204774796948, + "grad_norm": 0.8384194374084473, + "learning_rate": 4.782886648328495e-06, + "loss": 0.6811, + "step": 2258 + }, + { + "epoch": 0.833989662810731, + "grad_norm": 0.8807311654090881, + "learning_rate": 4.7826883522751934e-06, + "loss": 0.7016, + "step": 2259 + }, + { + "epoch": 0.8343588481417672, + "grad_norm": 0.8632846474647522, + "learning_rate": 4.782489969822857e-06, + "loss": 0.6956, + "step": 2260 + }, + { + "epoch": 0.8347280334728033, + "grad_norm": 0.852558434009552, + "learning_rate": 4.7822915009789965e-06, + "loss": 0.7076, + "step": 2261 + }, + { + "epoch": 0.8350972188038396, + "grad_norm": 0.8728243708610535, + "learning_rate": 4.782092945751122e-06, + "loss": 0.6962, + "step": 2262 + }, + { + "epoch": 0.8354664041348757, + "grad_norm": 0.8713400363922119, + "learning_rate": 4.781894304146751e-06, + "loss": 0.7163, + "step": 2263 + }, + { + "epoch": 0.8358355894659119, + "grad_norm": 0.893301784992218, + "learning_rate": 4.7816955761734e-06, + "loss": 0.7268, + "step": 2264 + }, + { + "epoch": 0.8362047747969481, + "grad_norm": 0.854168713092804, + "learning_rate": 4.781496761838592e-06, + "loss": 0.6719, + "step": 2265 + }, + { + "epoch": 0.8365739601279842, + "grad_norm": 0.8409112095832825, + "learning_rate": 4.781297861149852e-06, + "loss": 0.7148, + "step": 2266 + }, + { + "epoch": 0.8369431454590204, + "grad_norm": 0.8801021575927734, + "learning_rate": 4.781098874114707e-06, + "loss": 0.7365, + "step": 2267 + }, + { + "epoch": 0.8373123307900566, + "grad_norm": 0.849139392375946, + "learning_rate": 4.780899800740689e-06, + "loss": 0.7219, + "step": 2268 + }, + { + "epoch": 0.8376815161210928, + "grad_norm": 0.8867851495742798, + "learning_rate": 4.780700641035335e-06, + "loss": 0.7369, + "step": 2269 + }, + { + "epoch": 0.838050701452129, + "grad_norm": 0.877516508102417, + "learning_rate": 4.780501395006181e-06, + "loss": 0.7083, + "step": 2270 + }, + { + "epoch": 0.8384198867831651, + "grad_norm": 0.8607859015464783, + "learning_rate": 4.7803020626607686e-06, + "loss": 0.6921, + "step": 2271 + }, + { + "epoch": 0.8387890721142013, + "grad_norm": 0.8483718633651733, + "learning_rate": 4.7801026440066425e-06, + "loss": 0.6989, + "step": 2272 + }, + { + "epoch": 0.8391582574452375, + "grad_norm": 0.8639686107635498, + "learning_rate": 4.779903139051352e-06, + "loss": 0.6932, + "step": 2273 + }, + { + "epoch": 0.8395274427762737, + "grad_norm": 0.8545430302619934, + "learning_rate": 4.779703547802446e-06, + "loss": 0.7133, + "step": 2274 + }, + { + "epoch": 0.8398966281073099, + "grad_norm": 0.8846385478973389, + "learning_rate": 4.7795038702674816e-06, + "loss": 0.6923, + "step": 2275 + }, + { + "epoch": 0.840265813438346, + "grad_norm": 0.8536352515220642, + "learning_rate": 4.7793041064540135e-06, + "loss": 0.6885, + "step": 2276 + }, + { + "epoch": 0.8406349987693822, + "grad_norm": 0.8723476529121399, + "learning_rate": 4.779104256369605e-06, + "loss": 0.7001, + "step": 2277 + }, + { + "epoch": 0.8410041841004184, + "grad_norm": 0.8628838062286377, + "learning_rate": 4.77890432002182e-06, + "loss": 0.6879, + "step": 2278 + }, + { + "epoch": 0.8413733694314546, + "grad_norm": 0.8553763031959534, + "learning_rate": 4.778704297418226e-06, + "loss": 0.6245, + "step": 2279 + }, + { + "epoch": 0.8417425547624908, + "grad_norm": 0.9173882603645325, + "learning_rate": 4.778504188566393e-06, + "loss": 0.7236, + "step": 2280 + }, + { + "epoch": 0.842111740093527, + "grad_norm": 0.8579378128051758, + "learning_rate": 4.7783039934738955e-06, + "loss": 0.7327, + "step": 2281 + }, + { + "epoch": 0.8424809254245631, + "grad_norm": 0.8806033134460449, + "learning_rate": 4.778103712148311e-06, + "loss": 0.7236, + "step": 2282 + }, + { + "epoch": 0.8428501107555993, + "grad_norm": 0.8968467116355896, + "learning_rate": 4.77790334459722e-06, + "loss": 0.744, + "step": 2283 + }, + { + "epoch": 0.8432192960866355, + "grad_norm": 0.8785833120346069, + "learning_rate": 4.777702890828206e-06, + "loss": 0.6963, + "step": 2284 + }, + { + "epoch": 0.8435884814176716, + "grad_norm": 0.8928925395011902, + "learning_rate": 4.777502350848857e-06, + "loss": 0.7089, + "step": 2285 + }, + { + "epoch": 0.8439576667487079, + "grad_norm": 0.8602617383003235, + "learning_rate": 4.777301724666763e-06, + "loss": 0.7025, + "step": 2286 + }, + { + "epoch": 0.844326852079744, + "grad_norm": 0.9113879799842834, + "learning_rate": 4.777101012289517e-06, + "loss": 0.6959, + "step": 2287 + }, + { + "epoch": 0.8446960374107803, + "grad_norm": 0.8733735680580139, + "learning_rate": 4.776900213724717e-06, + "loss": 0.7307, + "step": 2288 + }, + { + "epoch": 0.8450652227418164, + "grad_norm": 0.8886083364486694, + "learning_rate": 4.776699328979961e-06, + "loss": 0.7134, + "step": 2289 + }, + { + "epoch": 0.8454344080728525, + "grad_norm": 0.9018360376358032, + "learning_rate": 4.776498358062855e-06, + "loss": 0.7246, + "step": 2290 + }, + { + "epoch": 0.8458035934038888, + "grad_norm": 0.9006841778755188, + "learning_rate": 4.776297300981005e-06, + "loss": 0.7188, + "step": 2291 + }, + { + "epoch": 0.8461727787349249, + "grad_norm": 0.8728944063186646, + "learning_rate": 4.77609615774202e-06, + "loss": 0.685, + "step": 2292 + }, + { + "epoch": 0.8465419640659612, + "grad_norm": 0.8995937705039978, + "learning_rate": 4.775894928353514e-06, + "loss": 0.6868, + "step": 2293 + }, + { + "epoch": 0.8469111493969973, + "grad_norm": 0.8754889369010925, + "learning_rate": 4.7756936128231026e-06, + "loss": 0.7013, + "step": 2294 + }, + { + "epoch": 0.8472803347280334, + "grad_norm": 0.8546683192253113, + "learning_rate": 4.775492211158407e-06, + "loss": 0.6874, + "step": 2295 + }, + { + "epoch": 0.8476495200590697, + "grad_norm": 0.8704326152801514, + "learning_rate": 4.775290723367048e-06, + "loss": 0.7295, + "step": 2296 + }, + { + "epoch": 0.8480187053901058, + "grad_norm": 0.9398274421691895, + "learning_rate": 4.7750891494566555e-06, + "loss": 0.7252, + "step": 2297 + }, + { + "epoch": 0.8483878907211421, + "grad_norm": 0.8432691097259521, + "learning_rate": 4.774887489434855e-06, + "loss": 0.6964, + "step": 2298 + }, + { + "epoch": 0.8487570760521782, + "grad_norm": 0.8544699549674988, + "learning_rate": 4.774685743309282e-06, + "loss": 0.6654, + "step": 2299 + }, + { + "epoch": 0.8491262613832143, + "grad_norm": 0.8489146828651428, + "learning_rate": 4.774483911087571e-06, + "loss": 0.7227, + "step": 2300 + }, + { + "epoch": 0.8494954467142506, + "grad_norm": 0.882328987121582, + "learning_rate": 4.774281992777361e-06, + "loss": 0.7194, + "step": 2301 + }, + { + "epoch": 0.8498646320452867, + "grad_norm": 0.8312026858329773, + "learning_rate": 4.7740799883862966e-06, + "loss": 0.6877, + "step": 2302 + }, + { + "epoch": 0.8502338173763229, + "grad_norm": 0.8677752017974854, + "learning_rate": 4.7738778979220215e-06, + "loss": 0.7522, + "step": 2303 + }, + { + "epoch": 0.8506030027073591, + "grad_norm": 0.8952152729034424, + "learning_rate": 4.773675721392186e-06, + "loss": 0.7023, + "step": 2304 + }, + { + "epoch": 0.8509721880383952, + "grad_norm": 0.8487045764923096, + "learning_rate": 4.773473458804442e-06, + "loss": 0.6788, + "step": 2305 + }, + { + "epoch": 0.8513413733694315, + "grad_norm": 0.8884228467941284, + "learning_rate": 4.7732711101664455e-06, + "loss": 0.6975, + "step": 2306 + }, + { + "epoch": 0.8517105587004676, + "grad_norm": 0.8592703342437744, + "learning_rate": 4.773068675485854e-06, + "loss": 0.6663, + "step": 2307 + }, + { + "epoch": 0.8520797440315038, + "grad_norm": 0.8684633374214172, + "learning_rate": 4.772866154770331e-06, + "loss": 0.7363, + "step": 2308 + }, + { + "epoch": 0.85244892936254, + "grad_norm": 0.8979021906852722, + "learning_rate": 4.772663548027542e-06, + "loss": 0.7207, + "step": 2309 + }, + { + "epoch": 0.8528181146935762, + "grad_norm": 0.8448242545127869, + "learning_rate": 4.772460855265154e-06, + "loss": 0.6676, + "step": 2310 + }, + { + "epoch": 0.8531873000246124, + "grad_norm": 0.8736171722412109, + "learning_rate": 4.77225807649084e-06, + "loss": 0.6814, + "step": 2311 + }, + { + "epoch": 0.8535564853556485, + "grad_norm": 0.908427357673645, + "learning_rate": 4.772055211712276e-06, + "loss": 0.7004, + "step": 2312 + }, + { + "epoch": 0.8539256706866847, + "grad_norm": 0.9255645275115967, + "learning_rate": 4.771852260937138e-06, + "loss": 0.7046, + "step": 2313 + }, + { + "epoch": 0.8542948560177209, + "grad_norm": 0.8731818795204163, + "learning_rate": 4.771649224173109e-06, + "loss": 0.6838, + "step": 2314 + }, + { + "epoch": 0.8546640413487571, + "grad_norm": 0.9106831550598145, + "learning_rate": 4.7714461014278745e-06, + "loss": 0.6892, + "step": 2315 + }, + { + "epoch": 0.8550332266797932, + "grad_norm": 0.9361001253128052, + "learning_rate": 4.771242892709121e-06, + "loss": 0.7194, + "step": 2316 + }, + { + "epoch": 0.8554024120108294, + "grad_norm": 0.9008825421333313, + "learning_rate": 4.771039598024542e-06, + "loss": 0.7086, + "step": 2317 + }, + { + "epoch": 0.8557715973418656, + "grad_norm": 0.8849498629570007, + "learning_rate": 4.77083621738183e-06, + "loss": 0.7, + "step": 2318 + }, + { + "epoch": 0.8561407826729018, + "grad_norm": 0.9198765158653259, + "learning_rate": 4.770632750788685e-06, + "loss": 0.7492, + "step": 2319 + }, + { + "epoch": 0.856509968003938, + "grad_norm": 0.8527234196662903, + "learning_rate": 4.770429198252806e-06, + "loss": 0.7013, + "step": 2320 + }, + { + "epoch": 0.8568791533349741, + "grad_norm": 0.8671658635139465, + "learning_rate": 4.770225559781899e-06, + "loss": 0.6557, + "step": 2321 + }, + { + "epoch": 0.8572483386660104, + "grad_norm": 0.9001504778862, + "learning_rate": 4.77002183538367e-06, + "loss": 0.7024, + "step": 2322 + }, + { + "epoch": 0.8576175239970465, + "grad_norm": 0.8563042879104614, + "learning_rate": 4.769818025065832e-06, + "loss": 0.6986, + "step": 2323 + }, + { + "epoch": 0.8579867093280827, + "grad_norm": 0.8947487473487854, + "learning_rate": 4.769614128836098e-06, + "loss": 0.696, + "step": 2324 + }, + { + "epoch": 0.8583558946591189, + "grad_norm": 0.8475277423858643, + "learning_rate": 4.769410146702186e-06, + "loss": 0.6987, + "step": 2325 + }, + { + "epoch": 0.858725079990155, + "grad_norm": 0.8660178184509277, + "learning_rate": 4.769206078671815e-06, + "loss": 0.7162, + "step": 2326 + }, + { + "epoch": 0.8590942653211913, + "grad_norm": 0.9072052836418152, + "learning_rate": 4.769001924752711e-06, + "loss": 0.7051, + "step": 2327 + }, + { + "epoch": 0.8594634506522274, + "grad_norm": 0.893517255783081, + "learning_rate": 4.7687976849526e-06, + "loss": 0.6847, + "step": 2328 + }, + { + "epoch": 0.8598326359832636, + "grad_norm": 0.8827802538871765, + "learning_rate": 4.768593359279212e-06, + "loss": 0.732, + "step": 2329 + }, + { + "epoch": 0.8602018213142998, + "grad_norm": 0.8519994616508484, + "learning_rate": 4.768388947740282e-06, + "loss": 0.7077, + "step": 2330 + }, + { + "epoch": 0.8605710066453359, + "grad_norm": 0.8811690211296082, + "learning_rate": 4.768184450343546e-06, + "loss": 0.724, + "step": 2331 + }, + { + "epoch": 0.8609401919763722, + "grad_norm": 0.8675876259803772, + "learning_rate": 4.7679798670967446e-06, + "loss": 0.723, + "step": 2332 + }, + { + "epoch": 0.8613093773074083, + "grad_norm": 0.8784676194190979, + "learning_rate": 4.767775198007621e-06, + "loss": 0.7162, + "step": 2333 + }, + { + "epoch": 0.8616785626384444, + "grad_norm": 0.904371440410614, + "learning_rate": 4.767570443083922e-06, + "loss": 0.746, + "step": 2334 + }, + { + "epoch": 0.8620477479694807, + "grad_norm": 0.9030969738960266, + "learning_rate": 4.767365602333397e-06, + "loss": 0.7272, + "step": 2335 + }, + { + "epoch": 0.8624169333005168, + "grad_norm": 0.9071660041809082, + "learning_rate": 4.7671606757638e-06, + "loss": 0.7358, + "step": 2336 + }, + { + "epoch": 0.8627861186315531, + "grad_norm": 0.8928775191307068, + "learning_rate": 4.766955663382887e-06, + "loss": 0.7025, + "step": 2337 + }, + { + "epoch": 0.8631553039625892, + "grad_norm": 0.8183935880661011, + "learning_rate": 4.766750565198417e-06, + "loss": 0.6496, + "step": 2338 + }, + { + "epoch": 0.8635244892936254, + "grad_norm": 0.9024012088775635, + "learning_rate": 4.7665453812181535e-06, + "loss": 0.7378, + "step": 2339 + }, + { + "epoch": 0.8638936746246616, + "grad_norm": 0.8578570485115051, + "learning_rate": 4.766340111449863e-06, + "loss": 0.7127, + "step": 2340 + }, + { + "epoch": 0.8642628599556977, + "grad_norm": 0.8736898303031921, + "learning_rate": 4.766134755901315e-06, + "loss": 0.7021, + "step": 2341 + }, + { + "epoch": 0.864632045286734, + "grad_norm": 0.8850502371788025, + "learning_rate": 4.765929314580281e-06, + "loss": 0.7092, + "step": 2342 + }, + { + "epoch": 0.8650012306177701, + "grad_norm": 0.8683915734291077, + "learning_rate": 4.765723787494538e-06, + "loss": 0.6966, + "step": 2343 + }, + { + "epoch": 0.8653704159488063, + "grad_norm": 0.8678285479545593, + "learning_rate": 4.765518174651864e-06, + "loss": 0.6836, + "step": 2344 + }, + { + "epoch": 0.8657396012798425, + "grad_norm": 0.8723390698432922, + "learning_rate": 4.7653124760600435e-06, + "loss": 0.6934, + "step": 2345 + }, + { + "epoch": 0.8661087866108786, + "grad_norm": 0.8977248072624207, + "learning_rate": 4.7651066917268595e-06, + "loss": 0.6904, + "step": 2346 + }, + { + "epoch": 0.8664779719419149, + "grad_norm": 0.8875908255577087, + "learning_rate": 4.764900821660102e-06, + "loss": 0.7268, + "step": 2347 + }, + { + "epoch": 0.866847157272951, + "grad_norm": 0.8535262942314148, + "learning_rate": 4.764694865867564e-06, + "loss": 0.6876, + "step": 2348 + }, + { + "epoch": 0.8672163426039872, + "grad_norm": 0.8672966361045837, + "learning_rate": 4.76448882435704e-06, + "loss": 0.7274, + "step": 2349 + }, + { + "epoch": 0.8675855279350234, + "grad_norm": 0.8733333945274353, + "learning_rate": 4.764282697136328e-06, + "loss": 0.7304, + "step": 2350 + }, + { + "epoch": 0.8679547132660596, + "grad_norm": 0.8675313591957092, + "learning_rate": 4.764076484213232e-06, + "loss": 0.73, + "step": 2351 + }, + { + "epoch": 0.8683238985970957, + "grad_norm": 0.9445770382881165, + "learning_rate": 4.763870185595554e-06, + "loss": 0.7039, + "step": 2352 + }, + { + "epoch": 0.8686930839281319, + "grad_norm": 0.8547798991203308, + "learning_rate": 4.763663801291104e-06, + "loss": 0.6863, + "step": 2353 + }, + { + "epoch": 0.8690622692591681, + "grad_norm": 0.8876418471336365, + "learning_rate": 4.763457331307695e-06, + "loss": 0.7051, + "step": 2354 + }, + { + "epoch": 0.8694314545902043, + "grad_norm": 0.8787064552307129, + "learning_rate": 4.763250775653139e-06, + "loss": 0.667, + "step": 2355 + }, + { + "epoch": 0.8698006399212405, + "grad_norm": 0.8645234704017639, + "learning_rate": 4.763044134335256e-06, + "loss": 0.7211, + "step": 2356 + }, + { + "epoch": 0.8701698252522766, + "grad_norm": 0.8852728605270386, + "learning_rate": 4.762837407361866e-06, + "loss": 0.6929, + "step": 2357 + }, + { + "epoch": 0.8705390105833128, + "grad_norm": 0.8681321144104004, + "learning_rate": 4.7626305947407944e-06, + "loss": 0.6783, + "step": 2358 + }, + { + "epoch": 0.870908195914349, + "grad_norm": 0.8449599146842957, + "learning_rate": 4.7624236964798695e-06, + "loss": 0.6916, + "step": 2359 + }, + { + "epoch": 0.8712773812453852, + "grad_norm": 0.8922649621963501, + "learning_rate": 4.762216712586922e-06, + "loss": 0.7218, + "step": 2360 + }, + { + "epoch": 0.8716465665764214, + "grad_norm": 0.8748049736022949, + "learning_rate": 4.762009643069786e-06, + "loss": 0.705, + "step": 2361 + }, + { + "epoch": 0.8720157519074575, + "grad_norm": 0.90827476978302, + "learning_rate": 4.761802487936298e-06, + "loss": 0.7234, + "step": 2362 + }, + { + "epoch": 0.8723849372384938, + "grad_norm": 0.8977128863334656, + "learning_rate": 4.7615952471943006e-06, + "loss": 0.7423, + "step": 2363 + }, + { + "epoch": 0.8727541225695299, + "grad_norm": 0.8443285822868347, + "learning_rate": 4.761387920851636e-06, + "loss": 0.6924, + "step": 2364 + }, + { + "epoch": 0.8731233079005661, + "grad_norm": 0.8841611742973328, + "learning_rate": 4.761180508916152e-06, + "loss": 0.6902, + "step": 2365 + }, + { + "epoch": 0.8734924932316023, + "grad_norm": 0.8554913401603699, + "learning_rate": 4.760973011395701e-06, + "loss": 0.6946, + "step": 2366 + }, + { + "epoch": 0.8738616785626384, + "grad_norm": 0.8738844990730286, + "learning_rate": 4.760765428298134e-06, + "loss": 0.6728, + "step": 2367 + }, + { + "epoch": 0.8742308638936747, + "grad_norm": 0.8898583650588989, + "learning_rate": 4.760557759631309e-06, + "loss": 0.7027, + "step": 2368 + }, + { + "epoch": 0.8746000492247108, + "grad_norm": 0.8664471507072449, + "learning_rate": 4.760350005403086e-06, + "loss": 0.7114, + "step": 2369 + }, + { + "epoch": 0.8749692345557469, + "grad_norm": 0.902643084526062, + "learning_rate": 4.76014216562133e-06, + "loss": 0.7317, + "step": 2370 + }, + { + "epoch": 0.8753384198867832, + "grad_norm": 0.8991020917892456, + "learning_rate": 4.759934240293906e-06, + "loss": 0.7036, + "step": 2371 + }, + { + "epoch": 0.8757076052178193, + "grad_norm": 0.8848997950553894, + "learning_rate": 4.759726229428683e-06, + "loss": 0.7416, + "step": 2372 + }, + { + "epoch": 0.8760767905488556, + "grad_norm": 0.8591799736022949, + "learning_rate": 4.759518133033536e-06, + "loss": 0.6713, + "step": 2373 + }, + { + "epoch": 0.8764459758798917, + "grad_norm": 0.8584043979644775, + "learning_rate": 4.7593099511163405e-06, + "loss": 0.6793, + "step": 2374 + }, + { + "epoch": 0.8768151612109278, + "grad_norm": 0.8347330689430237, + "learning_rate": 4.759101683684977e-06, + "loss": 0.6585, + "step": 2375 + }, + { + "epoch": 0.8771843465419641, + "grad_norm": 0.8343052864074707, + "learning_rate": 4.7588933307473275e-06, + "loss": 0.6982, + "step": 2376 + }, + { + "epoch": 0.8775535318730002, + "grad_norm": 0.8740595579147339, + "learning_rate": 4.758684892311278e-06, + "loss": 0.6836, + "step": 2377 + }, + { + "epoch": 0.8779227172040365, + "grad_norm": 0.8705688118934631, + "learning_rate": 4.758476368384719e-06, + "loss": 0.6859, + "step": 2378 + }, + { + "epoch": 0.8782919025350726, + "grad_norm": 1.0071030855178833, + "learning_rate": 4.758267758975541e-06, + "loss": 0.7224, + "step": 2379 + }, + { + "epoch": 0.8786610878661087, + "grad_norm": 0.8606764078140259, + "learning_rate": 4.758059064091642e-06, + "loss": 0.6912, + "step": 2380 + }, + { + "epoch": 0.879030273197145, + "grad_norm": 0.9006412625312805, + "learning_rate": 4.75785028374092e-06, + "loss": 0.7044, + "step": 2381 + }, + { + "epoch": 0.8793994585281811, + "grad_norm": 0.8861314058303833, + "learning_rate": 4.757641417931278e-06, + "loss": 0.6825, + "step": 2382 + }, + { + "epoch": 0.8797686438592174, + "grad_norm": 0.8772760033607483, + "learning_rate": 4.75743246667062e-06, + "loss": 0.6987, + "step": 2383 + }, + { + "epoch": 0.8801378291902535, + "grad_norm": 0.8669841289520264, + "learning_rate": 4.757223429966855e-06, + "loss": 0.6761, + "step": 2384 + }, + { + "epoch": 0.8805070145212897, + "grad_norm": 0.8585817813873291, + "learning_rate": 4.757014307827897e-06, + "loss": 0.6793, + "step": 2385 + }, + { + "epoch": 0.8808761998523259, + "grad_norm": 0.8737130165100098, + "learning_rate": 4.756805100261658e-06, + "loss": 0.6956, + "step": 2386 + }, + { + "epoch": 0.881245385183362, + "grad_norm": 0.8613405823707581, + "learning_rate": 4.75659580727606e-06, + "loss": 0.6852, + "step": 2387 + }, + { + "epoch": 0.8816145705143982, + "grad_norm": 0.8970152735710144, + "learning_rate": 4.756386428879022e-06, + "loss": 0.7446, + "step": 2388 + }, + { + "epoch": 0.8819837558454344, + "grad_norm": 0.8712779879570007, + "learning_rate": 4.75617696507847e-06, + "loss": 0.7216, + "step": 2389 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.8699203133583069, + "learning_rate": 4.755967415882331e-06, + "loss": 0.7225, + "step": 2390 + }, + { + "epoch": 0.8827221265075068, + "grad_norm": 0.8482356071472168, + "learning_rate": 4.755757781298539e-06, + "loss": 0.7013, + "step": 2391 + }, + { + "epoch": 0.883091311838543, + "grad_norm": 0.8718639016151428, + "learning_rate": 4.7555480613350255e-06, + "loss": 0.7017, + "step": 2392 + }, + { + "epoch": 0.8834604971695791, + "grad_norm": 0.886998176574707, + "learning_rate": 4.7553382559997305e-06, + "loss": 0.7333, + "step": 2393 + }, + { + "epoch": 0.8838296825006153, + "grad_norm": 0.9135888814926147, + "learning_rate": 4.755128365300594e-06, + "loss": 0.7033, + "step": 2394 + }, + { + "epoch": 0.8841988678316515, + "grad_norm": 0.8773205280303955, + "learning_rate": 4.7549183892455605e-06, + "loss": 0.71, + "step": 2395 + }, + { + "epoch": 0.8845680531626877, + "grad_norm": 0.8733408451080322, + "learning_rate": 4.754708327842579e-06, + "loss": 0.6728, + "step": 2396 + }, + { + "epoch": 0.8849372384937239, + "grad_norm": 0.86775141954422, + "learning_rate": 4.7544981810995976e-06, + "loss": 0.6991, + "step": 2397 + }, + { + "epoch": 0.88530642382476, + "grad_norm": 0.8773441314697266, + "learning_rate": 4.7542879490245705e-06, + "loss": 0.7403, + "step": 2398 + }, + { + "epoch": 0.8856756091557962, + "grad_norm": 0.8608452677726746, + "learning_rate": 4.754077631625457e-06, + "loss": 0.6506, + "step": 2399 + }, + { + "epoch": 0.8860447944868324, + "grad_norm": 0.866370677947998, + "learning_rate": 4.753867228910217e-06, + "loss": 0.7008, + "step": 2400 + }, + { + "epoch": 0.8864139798178686, + "grad_norm": 0.8670253753662109, + "learning_rate": 4.753656740886814e-06, + "loss": 0.7363, + "step": 2401 + }, + { + "epoch": 0.8867831651489048, + "grad_norm": 0.8610836267471313, + "learning_rate": 4.753446167563214e-06, + "loss": 0.6961, + "step": 2402 + }, + { + "epoch": 0.8871523504799409, + "grad_norm": 0.8401498198509216, + "learning_rate": 4.753235508947388e-06, + "loss": 0.6867, + "step": 2403 + }, + { + "epoch": 0.8875215358109771, + "grad_norm": 0.8600946068763733, + "learning_rate": 4.753024765047309e-06, + "loss": 0.6911, + "step": 2404 + }, + { + "epoch": 0.8878907211420133, + "grad_norm": 0.8780009150505066, + "learning_rate": 4.752813935870954e-06, + "loss": 0.7171, + "step": 2405 + }, + { + "epoch": 0.8882599064730494, + "grad_norm": 0.8487856984138489, + "learning_rate": 4.752603021426302e-06, + "loss": 0.6779, + "step": 2406 + }, + { + "epoch": 0.8886290918040857, + "grad_norm": 0.8483532071113586, + "learning_rate": 4.752392021721337e-06, + "loss": 0.6661, + "step": 2407 + }, + { + "epoch": 0.8889982771351218, + "grad_norm": 0.858508825302124, + "learning_rate": 4.752180936764044e-06, + "loss": 0.715, + "step": 2408 + }, + { + "epoch": 0.889367462466158, + "grad_norm": 0.850054919719696, + "learning_rate": 4.751969766562414e-06, + "loss": 0.6773, + "step": 2409 + }, + { + "epoch": 0.8897366477971942, + "grad_norm": 0.8784133195877075, + "learning_rate": 4.751758511124439e-06, + "loss": 0.6924, + "step": 2410 + }, + { + "epoch": 0.8901058331282303, + "grad_norm": 0.9067288041114807, + "learning_rate": 4.751547170458115e-06, + "loss": 0.698, + "step": 2411 + }, + { + "epoch": 0.8904750184592666, + "grad_norm": 0.8615005016326904, + "learning_rate": 4.751335744571441e-06, + "loss": 0.6891, + "step": 2412 + }, + { + "epoch": 0.8908442037903027, + "grad_norm": 0.8885693550109863, + "learning_rate": 4.75112423347242e-06, + "loss": 0.7179, + "step": 2413 + }, + { + "epoch": 0.891213389121339, + "grad_norm": 0.8749710321426392, + "learning_rate": 4.750912637169057e-06, + "loss": 0.7343, + "step": 2414 + }, + { + "epoch": 0.8915825744523751, + "grad_norm": 0.8808736205101013, + "learning_rate": 4.750700955669362e-06, + "loss": 0.7388, + "step": 2415 + }, + { + "epoch": 0.8919517597834112, + "grad_norm": 0.8593876957893372, + "learning_rate": 4.750489188981345e-06, + "loss": 0.6887, + "step": 2416 + }, + { + "epoch": 0.8923209451144475, + "grad_norm": 0.8734657168388367, + "learning_rate": 4.7502773371130225e-06, + "loss": 0.6933, + "step": 2417 + }, + { + "epoch": 0.8926901304454836, + "grad_norm": 0.8681308031082153, + "learning_rate": 4.750065400072413e-06, + "loss": 0.6763, + "step": 2418 + }, + { + "epoch": 0.8930593157765198, + "grad_norm": 0.8781231641769409, + "learning_rate": 4.7498533778675386e-06, + "loss": 0.6863, + "step": 2419 + }, + { + "epoch": 0.893428501107556, + "grad_norm": 1.038848876953125, + "learning_rate": 4.749641270506424e-06, + "loss": 0.6943, + "step": 2420 + }, + { + "epoch": 0.8937976864385921, + "grad_norm": 0.8760762214660645, + "learning_rate": 4.749429077997098e-06, + "loss": 0.7224, + "step": 2421 + }, + { + "epoch": 0.8941668717696284, + "grad_norm": 0.8649592995643616, + "learning_rate": 4.7492168003475894e-06, + "loss": 0.7052, + "step": 2422 + }, + { + "epoch": 0.8945360571006645, + "grad_norm": 0.8721056580543518, + "learning_rate": 4.749004437565936e-06, + "loss": 0.7032, + "step": 2423 + }, + { + "epoch": 0.8949052424317007, + "grad_norm": 0.9054531455039978, + "learning_rate": 4.748791989660174e-06, + "loss": 0.7161, + "step": 2424 + }, + { + "epoch": 0.8952744277627369, + "grad_norm": 0.8350791335105896, + "learning_rate": 4.748579456638346e-06, + "loss": 0.6682, + "step": 2425 + }, + { + "epoch": 0.895643613093773, + "grad_norm": 0.8753942251205444, + "learning_rate": 4.748366838508494e-06, + "loss": 0.6904, + "step": 2426 + }, + { + "epoch": 0.8960127984248093, + "grad_norm": 0.9068854451179504, + "learning_rate": 4.748154135278667e-06, + "loss": 0.7247, + "step": 2427 + }, + { + "epoch": 0.8963819837558454, + "grad_norm": 0.9158604145050049, + "learning_rate": 4.747941346956916e-06, + "loss": 0.7043, + "step": 2428 + }, + { + "epoch": 0.8967511690868816, + "grad_norm": 0.8779573440551758, + "learning_rate": 4.747728473551294e-06, + "loss": 0.7254, + "step": 2429 + }, + { + "epoch": 0.8971203544179178, + "grad_norm": 0.8369114398956299, + "learning_rate": 4.7475155150698585e-06, + "loss": 0.687, + "step": 2430 + }, + { + "epoch": 0.897489539748954, + "grad_norm": 0.8734737038612366, + "learning_rate": 4.747302471520671e-06, + "loss": 0.7011, + "step": 2431 + }, + { + "epoch": 0.8978587250799902, + "grad_norm": 0.8805413246154785, + "learning_rate": 4.747089342911793e-06, + "loss": 0.6706, + "step": 2432 + }, + { + "epoch": 0.8982279104110263, + "grad_norm": 0.8578121662139893, + "learning_rate": 4.746876129251293e-06, + "loss": 0.666, + "step": 2433 + }, + { + "epoch": 0.8985970957420625, + "grad_norm": 0.8652933835983276, + "learning_rate": 4.746662830547242e-06, + "loss": 0.6837, + "step": 2434 + }, + { + "epoch": 0.8989662810730987, + "grad_norm": 0.8619236350059509, + "learning_rate": 4.74644944680771e-06, + "loss": 0.7021, + "step": 2435 + }, + { + "epoch": 0.8993354664041349, + "grad_norm": 0.9408307075500488, + "learning_rate": 4.746235978040776e-06, + "loss": 0.7223, + "step": 2436 + }, + { + "epoch": 0.899704651735171, + "grad_norm": 0.8929667472839355, + "learning_rate": 4.74602242425452e-06, + "loss": 0.7132, + "step": 2437 + }, + { + "epoch": 0.9000738370662072, + "grad_norm": 0.8759931921958923, + "learning_rate": 4.745808785457023e-06, + "loss": 0.7078, + "step": 2438 + }, + { + "epoch": 0.9004430223972434, + "grad_norm": 0.8628905415534973, + "learning_rate": 4.745595061656372e-06, + "loss": 0.685, + "step": 2439 + }, + { + "epoch": 0.9008122077282796, + "grad_norm": 0.8453993797302246, + "learning_rate": 4.745381252860658e-06, + "loss": 0.6745, + "step": 2440 + }, + { + "epoch": 0.9011813930593158, + "grad_norm": 0.8623847365379333, + "learning_rate": 4.745167359077971e-06, + "loss": 0.6999, + "step": 2441 + }, + { + "epoch": 0.9015505783903519, + "grad_norm": 0.8913419246673584, + "learning_rate": 4.7449533803164085e-06, + "loss": 0.7087, + "step": 2442 + }, + { + "epoch": 0.9019197637213882, + "grad_norm": 0.8496670722961426, + "learning_rate": 4.744739316584069e-06, + "loss": 0.7265, + "step": 2443 + }, + { + "epoch": 0.9022889490524243, + "grad_norm": 0.8533394932746887, + "learning_rate": 4.7445251678890555e-06, + "loss": 0.7205, + "step": 2444 + }, + { + "epoch": 0.9026581343834605, + "grad_norm": 0.886719286441803, + "learning_rate": 4.744310934239472e-06, + "loss": 0.7136, + "step": 2445 + }, + { + "epoch": 0.9030273197144967, + "grad_norm": 0.8670817613601685, + "learning_rate": 4.744096615643428e-06, + "loss": 0.7349, + "step": 2446 + }, + { + "epoch": 0.9033965050455328, + "grad_norm": 0.8604776263237, + "learning_rate": 4.743882212109036e-06, + "loss": 0.6867, + "step": 2447 + }, + { + "epoch": 0.9037656903765691, + "grad_norm": 0.8604888916015625, + "learning_rate": 4.74366772364441e-06, + "loss": 0.6832, + "step": 2448 + }, + { + "epoch": 0.9041348757076052, + "grad_norm": 0.9036632776260376, + "learning_rate": 4.743453150257668e-06, + "loss": 0.7155, + "step": 2449 + }, + { + "epoch": 0.9045040610386414, + "grad_norm": 0.8700803518295288, + "learning_rate": 4.743238491956934e-06, + "loss": 0.6916, + "step": 2450 + }, + { + "epoch": 0.9048732463696776, + "grad_norm": 0.8630246520042419, + "learning_rate": 4.74302374875033e-06, + "loss": 0.6927, + "step": 2451 + }, + { + "epoch": 0.9052424317007137, + "grad_norm": 0.8841362595558167, + "learning_rate": 4.7428089206459845e-06, + "loss": 0.7137, + "step": 2452 + }, + { + "epoch": 0.90561161703175, + "grad_norm": 0.8694362044334412, + "learning_rate": 4.742594007652031e-06, + "loss": 0.6853, + "step": 2453 + }, + { + "epoch": 0.9059808023627861, + "grad_norm": 0.8494770526885986, + "learning_rate": 4.7423790097766006e-06, + "loss": 0.6711, + "step": 2454 + }, + { + "epoch": 0.9063499876938222, + "grad_norm": 0.8646619915962219, + "learning_rate": 4.742163927027833e-06, + "loss": 0.7179, + "step": 2455 + }, + { + "epoch": 0.9067191730248585, + "grad_norm": 0.877597987651825, + "learning_rate": 4.741948759413868e-06, + "loss": 0.7234, + "step": 2456 + }, + { + "epoch": 0.9070883583558946, + "grad_norm": 0.850517988204956, + "learning_rate": 4.741733506942849e-06, + "loss": 0.7318, + "step": 2457 + }, + { + "epoch": 0.9074575436869309, + "grad_norm": 0.8301242589950562, + "learning_rate": 4.741518169622926e-06, + "loss": 0.694, + "step": 2458 + }, + { + "epoch": 0.907826729017967, + "grad_norm": 0.8391947150230408, + "learning_rate": 4.741302747462248e-06, + "loss": 0.7091, + "step": 2459 + }, + { + "epoch": 0.9081959143490032, + "grad_norm": 0.8886072039604187, + "learning_rate": 4.741087240468967e-06, + "loss": 0.7058, + "step": 2460 + }, + { + "epoch": 0.9085650996800394, + "grad_norm": 0.8848855495452881, + "learning_rate": 4.7408716486512416e-06, + "loss": 0.7176, + "step": 2461 + }, + { + "epoch": 0.9089342850110755, + "grad_norm": 0.8657143712043762, + "learning_rate": 4.740655972017232e-06, + "loss": 0.72, + "step": 2462 + }, + { + "epoch": 0.9093034703421118, + "grad_norm": 0.879115104675293, + "learning_rate": 4.7404402105751e-06, + "loss": 0.7008, + "step": 2463 + }, + { + "epoch": 0.9096726556731479, + "grad_norm": 0.8512206673622131, + "learning_rate": 4.740224364333013e-06, + "loss": 0.7051, + "step": 2464 + }, + { + "epoch": 0.9100418410041841, + "grad_norm": 0.8688360452651978, + "learning_rate": 4.740008433299142e-06, + "loss": 0.7031, + "step": 2465 + }, + { + "epoch": 0.9104110263352203, + "grad_norm": 0.8307510614395142, + "learning_rate": 4.739792417481659e-06, + "loss": 0.6484, + "step": 2466 + }, + { + "epoch": 0.9107802116662564, + "grad_norm": 0.8932550549507141, + "learning_rate": 4.7395763168887395e-06, + "loss": 0.7204, + "step": 2467 + }, + { + "epoch": 0.9111493969972927, + "grad_norm": 0.8596043586730957, + "learning_rate": 4.739360131528563e-06, + "loss": 0.7266, + "step": 2468 + }, + { + "epoch": 0.9115185823283288, + "grad_norm": 0.8368245959281921, + "learning_rate": 4.739143861409312e-06, + "loss": 0.6906, + "step": 2469 + }, + { + "epoch": 0.911887767659365, + "grad_norm": 0.8623902797698975, + "learning_rate": 4.738927506539173e-06, + "loss": 0.6946, + "step": 2470 + }, + { + "epoch": 0.9122569529904012, + "grad_norm": 0.8611836433410645, + "learning_rate": 4.738711066926335e-06, + "loss": 0.6655, + "step": 2471 + }, + { + "epoch": 0.9126261383214374, + "grad_norm": 0.8788560032844543, + "learning_rate": 4.738494542578989e-06, + "loss": 0.7327, + "step": 2472 + }, + { + "epoch": 0.9129953236524735, + "grad_norm": 0.9105246663093567, + "learning_rate": 4.73827793350533e-06, + "loss": 0.7424, + "step": 2473 + }, + { + "epoch": 0.9133645089835097, + "grad_norm": 0.8795875310897827, + "learning_rate": 4.738061239713559e-06, + "loss": 0.7092, + "step": 2474 + }, + { + "epoch": 0.9137336943145459, + "grad_norm": 0.8684108257293701, + "learning_rate": 4.737844461211876e-06, + "loss": 0.6988, + "step": 2475 + }, + { + "epoch": 0.9141028796455821, + "grad_norm": 0.8691182136535645, + "learning_rate": 4.737627598008486e-06, + "loss": 0.7363, + "step": 2476 + }, + { + "epoch": 0.9144720649766183, + "grad_norm": 0.8747261166572571, + "learning_rate": 4.737410650111599e-06, + "loss": 0.6877, + "step": 2477 + }, + { + "epoch": 0.9148412503076544, + "grad_norm": 0.8895252346992493, + "learning_rate": 4.7371936175294246e-06, + "loss": 0.7078, + "step": 2478 + }, + { + "epoch": 0.9152104356386906, + "grad_norm": 0.877252459526062, + "learning_rate": 4.736976500270177e-06, + "loss": 0.7118, + "step": 2479 + }, + { + "epoch": 0.9155796209697268, + "grad_norm": 0.8511465787887573, + "learning_rate": 4.736759298342075e-06, + "loss": 0.6753, + "step": 2480 + }, + { + "epoch": 0.915948806300763, + "grad_norm": 0.8913504481315613, + "learning_rate": 4.7365420117533404e-06, + "loss": 0.6777, + "step": 2481 + }, + { + "epoch": 0.9163179916317992, + "grad_norm": 0.8873251080513, + "learning_rate": 4.736324640512195e-06, + "loss": 0.7191, + "step": 2482 + }, + { + "epoch": 0.9166871769628353, + "grad_norm": 0.8914928436279297, + "learning_rate": 4.736107184626869e-06, + "loss": 0.7094, + "step": 2483 + }, + { + "epoch": 0.9170563622938716, + "grad_norm": 0.8406426310539246, + "learning_rate": 4.735889644105591e-06, + "loss": 0.7082, + "step": 2484 + }, + { + "epoch": 0.9174255476249077, + "grad_norm": 0.9501475095748901, + "learning_rate": 4.735672018956596e-06, + "loss": 0.7158, + "step": 2485 + }, + { + "epoch": 0.9177947329559439, + "grad_norm": 0.8835409283638, + "learning_rate": 4.735454309188121e-06, + "loss": 0.6742, + "step": 2486 + }, + { + "epoch": 0.9181639182869801, + "grad_norm": 0.883091926574707, + "learning_rate": 4.735236514808406e-06, + "loss": 0.6994, + "step": 2487 + }, + { + "epoch": 0.9185331036180162, + "grad_norm": 0.8853040337562561, + "learning_rate": 4.735018635825693e-06, + "loss": 0.7197, + "step": 2488 + }, + { + "epoch": 0.9189022889490525, + "grad_norm": 0.8868618011474609, + "learning_rate": 4.734800672248231e-06, + "loss": 0.6744, + "step": 2489 + }, + { + "epoch": 0.9192714742800886, + "grad_norm": 0.9185392260551453, + "learning_rate": 4.73458262408427e-06, + "loss": 0.6616, + "step": 2490 + }, + { + "epoch": 0.9196406596111247, + "grad_norm": 0.8852180242538452, + "learning_rate": 4.734364491342061e-06, + "loss": 0.6921, + "step": 2491 + }, + { + "epoch": 0.920009844942161, + "grad_norm": 0.8949165344238281, + "learning_rate": 4.7341462740298605e-06, + "loss": 0.7063, + "step": 2492 + }, + { + "epoch": 0.9203790302731971, + "grad_norm": 0.8711039423942566, + "learning_rate": 4.7339279721559285e-06, + "loss": 0.7267, + "step": 2493 + }, + { + "epoch": 0.9207482156042334, + "grad_norm": 0.8843966126441956, + "learning_rate": 4.733709585728528e-06, + "loss": 0.7179, + "step": 2494 + }, + { + "epoch": 0.9211174009352695, + "grad_norm": 0.93259596824646, + "learning_rate": 4.733491114755926e-06, + "loss": 0.7065, + "step": 2495 + }, + { + "epoch": 0.9214865862663056, + "grad_norm": 0.9091447591781616, + "learning_rate": 4.733272559246389e-06, + "loss": 0.7224, + "step": 2496 + }, + { + "epoch": 0.9218557715973419, + "grad_norm": 0.8795716762542725, + "learning_rate": 4.73305391920819e-06, + "loss": 0.7119, + "step": 2497 + }, + { + "epoch": 0.922224956928378, + "grad_norm": 0.8672060966491699, + "learning_rate": 4.732835194649607e-06, + "loss": 0.7081, + "step": 2498 + }, + { + "epoch": 0.9225941422594143, + "grad_norm": 0.9056263566017151, + "learning_rate": 4.732616385578914e-06, + "loss": 0.7254, + "step": 2499 + }, + { + "epoch": 0.9229633275904504, + "grad_norm": 0.9208519458770752, + "learning_rate": 4.7323974920043965e-06, + "loss": 0.7183, + "step": 2500 + }, + { + "epoch": 0.9233325129214865, + "grad_norm": 0.8479480743408203, + "learning_rate": 4.732178513934339e-06, + "loss": 0.7086, + "step": 2501 + }, + { + "epoch": 0.9237016982525228, + "grad_norm": 0.84977126121521, + "learning_rate": 4.73195945137703e-06, + "loss": 0.701, + "step": 2502 + }, + { + "epoch": 0.9240708835835589, + "grad_norm": 0.8850725889205933, + "learning_rate": 4.7317403043407584e-06, + "loss": 0.6984, + "step": 2503 + }, + { + "epoch": 0.9244400689145952, + "grad_norm": 0.8803204298019409, + "learning_rate": 4.7315210728338215e-06, + "loss": 0.7318, + "step": 2504 + }, + { + "epoch": 0.9248092542456313, + "grad_norm": 0.8582308888435364, + "learning_rate": 4.731301756864516e-06, + "loss": 0.7015, + "step": 2505 + }, + { + "epoch": 0.9251784395766675, + "grad_norm": 0.8383595943450928, + "learning_rate": 4.731082356441143e-06, + "loss": 0.6385, + "step": 2506 + }, + { + "epoch": 0.9255476249077037, + "grad_norm": 0.8716678619384766, + "learning_rate": 4.730862871572008e-06, + "loss": 0.6989, + "step": 2507 + }, + { + "epoch": 0.9259168102387398, + "grad_norm": 0.8813826441764832, + "learning_rate": 4.730643302265416e-06, + "loss": 0.6775, + "step": 2508 + }, + { + "epoch": 0.926285995569776, + "grad_norm": 0.8515213131904602, + "learning_rate": 4.730423648529679e-06, + "loss": 0.7048, + "step": 2509 + }, + { + "epoch": 0.9266551809008122, + "grad_norm": 0.8710786700248718, + "learning_rate": 4.730203910373112e-06, + "loss": 0.7262, + "step": 2510 + }, + { + "epoch": 0.9270243662318484, + "grad_norm": 0.8492597341537476, + "learning_rate": 4.729984087804031e-06, + "loss": 0.7031, + "step": 2511 + }, + { + "epoch": 0.9273935515628846, + "grad_norm": 0.8640585541725159, + "learning_rate": 4.729764180830754e-06, + "loss": 0.683, + "step": 2512 + }, + { + "epoch": 0.9277627368939207, + "grad_norm": 0.85969078540802, + "learning_rate": 4.729544189461608e-06, + "loss": 0.6709, + "step": 2513 + }, + { + "epoch": 0.9281319222249569, + "grad_norm": 0.8955800533294678, + "learning_rate": 4.729324113704918e-06, + "loss": 0.7304, + "step": 2514 + }, + { + "epoch": 0.9285011075559931, + "grad_norm": 0.882088303565979, + "learning_rate": 4.729103953569014e-06, + "loss": 0.7078, + "step": 2515 + }, + { + "epoch": 0.9288702928870293, + "grad_norm": 0.8810111880302429, + "learning_rate": 4.728883709062229e-06, + "loss": 0.7412, + "step": 2516 + }, + { + "epoch": 0.9292394782180655, + "grad_norm": 0.9056873321533203, + "learning_rate": 4.728663380192898e-06, + "loss": 0.7202, + "step": 2517 + }, + { + "epoch": 0.9296086635491017, + "grad_norm": 0.8942249417304993, + "learning_rate": 4.728442966969363e-06, + "loss": 0.6895, + "step": 2518 + }, + { + "epoch": 0.9299778488801378, + "grad_norm": 0.8909201622009277, + "learning_rate": 4.728222469399964e-06, + "loss": 0.7394, + "step": 2519 + }, + { + "epoch": 0.930347034211174, + "grad_norm": 0.8551014065742493, + "learning_rate": 4.728001887493048e-06, + "loss": 0.6906, + "step": 2520 + }, + { + "epoch": 0.9307162195422102, + "grad_norm": 0.8875094652175903, + "learning_rate": 4.727781221256963e-06, + "loss": 0.7134, + "step": 2521 + }, + { + "epoch": 0.9310854048732464, + "grad_norm": 0.8559072613716125, + "learning_rate": 4.727560470700064e-06, + "loss": 0.6838, + "step": 2522 + }, + { + "epoch": 0.9314545902042826, + "grad_norm": 0.8632515668869019, + "learning_rate": 4.727339635830702e-06, + "loss": 0.6783, + "step": 2523 + }, + { + "epoch": 0.9318237755353187, + "grad_norm": 0.8152856230735779, + "learning_rate": 4.727118716657239e-06, + "loss": 0.6142, + "step": 2524 + }, + { + "epoch": 0.932192960866355, + "grad_norm": 0.9158169031143188, + "learning_rate": 4.726897713188035e-06, + "loss": 0.7147, + "step": 2525 + }, + { + "epoch": 0.9325621461973911, + "grad_norm": 0.8849809765815735, + "learning_rate": 4.726676625431454e-06, + "loss": 0.6753, + "step": 2526 + }, + { + "epoch": 0.9329313315284272, + "grad_norm": 0.8620732426643372, + "learning_rate": 4.726455453395867e-06, + "loss": 0.6979, + "step": 2527 + }, + { + "epoch": 0.9333005168594635, + "grad_norm": 0.8598245978355408, + "learning_rate": 4.726234197089644e-06, + "loss": 0.7447, + "step": 2528 + }, + { + "epoch": 0.9336697021904996, + "grad_norm": 0.8678368330001831, + "learning_rate": 4.726012856521158e-06, + "loss": 0.693, + "step": 2529 + }, + { + "epoch": 0.9340388875215359, + "grad_norm": 0.8540114164352417, + "learning_rate": 4.72579143169879e-06, + "loss": 0.6663, + "step": 2530 + }, + { + "epoch": 0.934408072852572, + "grad_norm": 0.8417086601257324, + "learning_rate": 4.725569922630917e-06, + "loss": 0.689, + "step": 2531 + }, + { + "epoch": 0.9347772581836081, + "grad_norm": 0.8707804679870605, + "learning_rate": 4.725348329325925e-06, + "loss": 0.7039, + "step": 2532 + }, + { + "epoch": 0.9351464435146444, + "grad_norm": 0.8656783103942871, + "learning_rate": 4.725126651792202e-06, + "loss": 0.7056, + "step": 2533 + }, + { + "epoch": 0.9355156288456805, + "grad_norm": 0.8286459445953369, + "learning_rate": 4.724904890038137e-06, + "loss": 0.6859, + "step": 2534 + }, + { + "epoch": 0.9358848141767168, + "grad_norm": 0.8686536550521851, + "learning_rate": 4.724683044072124e-06, + "loss": 0.6817, + "step": 2535 + }, + { + "epoch": 0.9362539995077529, + "grad_norm": 0.8872658014297485, + "learning_rate": 4.7244611139025595e-06, + "loss": 0.7093, + "step": 2536 + }, + { + "epoch": 0.936623184838789, + "grad_norm": 0.8674843907356262, + "learning_rate": 4.724239099537845e-06, + "loss": 0.7042, + "step": 2537 + }, + { + "epoch": 0.9369923701698253, + "grad_norm": 0.8639495968818665, + "learning_rate": 4.7240170009863816e-06, + "loss": 0.7111, + "step": 2538 + }, + { + "epoch": 0.9373615555008614, + "grad_norm": 0.8369062542915344, + "learning_rate": 4.7237948182565765e-06, + "loss": 0.6811, + "step": 2539 + }, + { + "epoch": 0.9377307408318976, + "grad_norm": 0.8846485614776611, + "learning_rate": 4.72357255135684e-06, + "loss": 0.6932, + "step": 2540 + }, + { + "epoch": 0.9380999261629338, + "grad_norm": 0.8939961194992065, + "learning_rate": 4.723350200295584e-06, + "loss": 0.7092, + "step": 2541 + }, + { + "epoch": 0.9384691114939699, + "grad_norm": 0.864433228969574, + "learning_rate": 4.723127765081225e-06, + "loss": 0.6878, + "step": 2542 + }, + { + "epoch": 0.9388382968250062, + "grad_norm": 0.8481778502464294, + "learning_rate": 4.7229052457221816e-06, + "loss": 0.6775, + "step": 2543 + }, + { + "epoch": 0.9392074821560423, + "grad_norm": 0.8581600785255432, + "learning_rate": 4.722682642226875e-06, + "loss": 0.6961, + "step": 2544 + }, + { + "epoch": 0.9395766674870785, + "grad_norm": 0.8689659833908081, + "learning_rate": 4.722459954603733e-06, + "loss": 0.6986, + "step": 2545 + }, + { + "epoch": 0.9399458528181147, + "grad_norm": 0.8631998896598816, + "learning_rate": 4.722237182861183e-06, + "loss": 0.6756, + "step": 2546 + }, + { + "epoch": 0.9403150381491509, + "grad_norm": 0.8720018863677979, + "learning_rate": 4.722014327007657e-06, + "loss": 0.7085, + "step": 2547 + }, + { + "epoch": 0.9406842234801871, + "grad_norm": 0.8763655424118042, + "learning_rate": 4.72179138705159e-06, + "loss": 0.6919, + "step": 2548 + }, + { + "epoch": 0.9410534088112232, + "grad_norm": 0.8881059288978577, + "learning_rate": 4.72156836300142e-06, + "loss": 0.6924, + "step": 2549 + }, + { + "epoch": 0.9414225941422594, + "grad_norm": 0.8810322284698486, + "learning_rate": 4.721345254865589e-06, + "loss": 0.7164, + "step": 2550 + }, + { + "epoch": 0.9417917794732956, + "grad_norm": 0.8332381248474121, + "learning_rate": 4.721122062652541e-06, + "loss": 0.7418, + "step": 2551 + }, + { + "epoch": 0.9421609648043318, + "grad_norm": 0.8442291617393494, + "learning_rate": 4.720898786370723e-06, + "loss": 0.6884, + "step": 2552 + }, + { + "epoch": 0.942530150135368, + "grad_norm": 0.8065565824508667, + "learning_rate": 4.720675426028588e-06, + "loss": 0.6191, + "step": 2553 + }, + { + "epoch": 0.9428993354664041, + "grad_norm": 0.8640344142913818, + "learning_rate": 4.720451981634589e-06, + "loss": 0.7072, + "step": 2554 + }, + { + "epoch": 0.9432685207974403, + "grad_norm": 0.8636446595191956, + "learning_rate": 4.720228453197183e-06, + "loss": 0.6901, + "step": 2555 + }, + { + "epoch": 0.9436377061284765, + "grad_norm": 0.8631062507629395, + "learning_rate": 4.720004840724831e-06, + "loss": 0.7035, + "step": 2556 + }, + { + "epoch": 0.9440068914595127, + "grad_norm": 0.8414477705955505, + "learning_rate": 4.7197811442259955e-06, + "loss": 0.6541, + "step": 2557 + }, + { + "epoch": 0.9443760767905488, + "grad_norm": 0.8783524632453918, + "learning_rate": 4.719557363709145e-06, + "loss": 0.6971, + "step": 2558 + }, + { + "epoch": 0.944745262121585, + "grad_norm": 0.8537473678588867, + "learning_rate": 4.7193334991827486e-06, + "loss": 0.6927, + "step": 2559 + }, + { + "epoch": 0.9451144474526212, + "grad_norm": 0.9084930419921875, + "learning_rate": 4.7191095506552795e-06, + "loss": 0.6974, + "step": 2560 + }, + { + "epoch": 0.9454836327836574, + "grad_norm": 0.9035632610321045, + "learning_rate": 4.718885518135215e-06, + "loss": 0.746, + "step": 2561 + }, + { + "epoch": 0.9458528181146936, + "grad_norm": 0.8823369741439819, + "learning_rate": 4.718661401631033e-06, + "loss": 0.6662, + "step": 2562 + }, + { + "epoch": 0.9462220034457297, + "grad_norm": 0.8840197324752808, + "learning_rate": 4.718437201151218e-06, + "loss": 0.692, + "step": 2563 + }, + { + "epoch": 0.946591188776766, + "grad_norm": 0.865744948387146, + "learning_rate": 4.718212916704254e-06, + "loss": 0.7252, + "step": 2564 + }, + { + "epoch": 0.9469603741078021, + "grad_norm": 0.8541647791862488, + "learning_rate": 4.717988548298633e-06, + "loss": 0.69, + "step": 2565 + }, + { + "epoch": 0.9473295594388383, + "grad_norm": 0.9258102774620056, + "learning_rate": 4.717764095942844e-06, + "loss": 0.7265, + "step": 2566 + }, + { + "epoch": 0.9476987447698745, + "grad_norm": 0.8850582242012024, + "learning_rate": 4.717539559645384e-06, + "loss": 0.7063, + "step": 2567 + }, + { + "epoch": 0.9480679301009106, + "grad_norm": 0.8705160021781921, + "learning_rate": 4.717314939414752e-06, + "loss": 0.713, + "step": 2568 + }, + { + "epoch": 0.9484371154319469, + "grad_norm": 0.8894596099853516, + "learning_rate": 4.717090235259449e-06, + "loss": 0.6615, + "step": 2569 + }, + { + "epoch": 0.948806300762983, + "grad_norm": 0.8761441707611084, + "learning_rate": 4.7168654471879806e-06, + "loss": 0.6738, + "step": 2570 + }, + { + "epoch": 0.9491754860940192, + "grad_norm": 0.9032109975814819, + "learning_rate": 4.716640575208855e-06, + "loss": 0.7081, + "step": 2571 + }, + { + "epoch": 0.9495446714250554, + "grad_norm": 0.8425540924072266, + "learning_rate": 4.716415619330582e-06, + "loss": 0.6321, + "step": 2572 + }, + { + "epoch": 0.9499138567560915, + "grad_norm": 0.87245112657547, + "learning_rate": 4.716190579561678e-06, + "loss": 0.7024, + "step": 2573 + }, + { + "epoch": 0.9502830420871278, + "grad_norm": 0.8776750564575195, + "learning_rate": 4.71596545591066e-06, + "loss": 0.6897, + "step": 2574 + }, + { + "epoch": 0.9506522274181639, + "grad_norm": 0.8822212815284729, + "learning_rate": 4.7157402483860496e-06, + "loss": 0.6829, + "step": 2575 + }, + { + "epoch": 0.9510214127492, + "grad_norm": 0.8839916586875916, + "learning_rate": 4.7155149569963696e-06, + "loss": 0.6821, + "step": 2576 + }, + { + "epoch": 0.9513905980802363, + "grad_norm": 0.8598265647888184, + "learning_rate": 4.715289581750147e-06, + "loss": 0.714, + "step": 2577 + }, + { + "epoch": 0.9517597834112724, + "grad_norm": 0.8639992475509644, + "learning_rate": 4.7150641226559136e-06, + "loss": 0.7089, + "step": 2578 + }, + { + "epoch": 0.9521289687423087, + "grad_norm": 0.8500651717185974, + "learning_rate": 4.714838579722202e-06, + "loss": 0.6681, + "step": 2579 + }, + { + "epoch": 0.9524981540733448, + "grad_norm": 0.8705741763114929, + "learning_rate": 4.714612952957549e-06, + "loss": 0.6966, + "step": 2580 + }, + { + "epoch": 0.952867339404381, + "grad_norm": 0.8744728565216064, + "learning_rate": 4.714387242370495e-06, + "loss": 0.7105, + "step": 2581 + }, + { + "epoch": 0.9532365247354172, + "grad_norm": 0.844121515750885, + "learning_rate": 4.714161447969583e-06, + "loss": 0.6552, + "step": 2582 + }, + { + "epoch": 0.9536057100664533, + "grad_norm": 0.8838944435119629, + "learning_rate": 4.7139355697633584e-06, + "loss": 0.6735, + "step": 2583 + }, + { + "epoch": 0.9539748953974896, + "grad_norm": 0.8674659132957458, + "learning_rate": 4.713709607760371e-06, + "loss": 0.6755, + "step": 2584 + }, + { + "epoch": 0.9543440807285257, + "grad_norm": 0.8746424913406372, + "learning_rate": 4.713483561969175e-06, + "loss": 0.7173, + "step": 2585 + }, + { + "epoch": 0.9547132660595619, + "grad_norm": 0.8750790357589722, + "learning_rate": 4.713257432398324e-06, + "loss": 0.7156, + "step": 2586 + }, + { + "epoch": 0.9550824513905981, + "grad_norm": 0.9194653034210205, + "learning_rate": 4.713031219056377e-06, + "loss": 0.7156, + "step": 2587 + }, + { + "epoch": 0.9554516367216342, + "grad_norm": 0.8654546737670898, + "learning_rate": 4.712804921951898e-06, + "loss": 0.6755, + "step": 2588 + }, + { + "epoch": 0.9558208220526705, + "grad_norm": 0.8833775520324707, + "learning_rate": 4.71257854109345e-06, + "loss": 0.6777, + "step": 2589 + }, + { + "epoch": 0.9561900073837066, + "grad_norm": 0.8929355144500732, + "learning_rate": 4.712352076489603e-06, + "loss": 0.7201, + "step": 2590 + }, + { + "epoch": 0.9565591927147428, + "grad_norm": 0.8803203105926514, + "learning_rate": 4.7121255281489275e-06, + "loss": 0.7037, + "step": 2591 + }, + { + "epoch": 0.956928378045779, + "grad_norm": 3.147451877593994, + "learning_rate": 4.71189889608e-06, + "loss": 0.7059, + "step": 2592 + }, + { + "epoch": 0.9572975633768152, + "grad_norm": 0.8626553416252136, + "learning_rate": 4.711672180291397e-06, + "loss": 0.7017, + "step": 2593 + }, + { + "epoch": 0.9576667487078513, + "grad_norm": 0.8505844473838806, + "learning_rate": 4.711445380791699e-06, + "loss": 0.6857, + "step": 2594 + }, + { + "epoch": 0.9580359340388875, + "grad_norm": 0.8895529508590698, + "learning_rate": 4.711218497589493e-06, + "loss": 0.7022, + "step": 2595 + }, + { + "epoch": 0.9584051193699237, + "grad_norm": 0.8883222341537476, + "learning_rate": 4.710991530693364e-06, + "loss": 0.6423, + "step": 2596 + }, + { + "epoch": 0.9587743047009599, + "grad_norm": 0.8487628102302551, + "learning_rate": 4.710764480111903e-06, + "loss": 0.6979, + "step": 2597 + }, + { + "epoch": 0.9591434900319961, + "grad_norm": 0.8244770765304565, + "learning_rate": 4.710537345853704e-06, + "loss": 0.6925, + "step": 2598 + }, + { + "epoch": 0.9595126753630322, + "grad_norm": 0.9382370710372925, + "learning_rate": 4.710310127927364e-06, + "loss": 0.7106, + "step": 2599 + }, + { + "epoch": 0.9598818606940684, + "grad_norm": 0.8556570410728455, + "learning_rate": 4.710082826341484e-06, + "loss": 0.6918, + "step": 2600 + }, + { + "epoch": 0.9602510460251046, + "grad_norm": 0.8350138068199158, + "learning_rate": 4.709855441104667e-06, + "loss": 0.6908, + "step": 2601 + }, + { + "epoch": 0.9606202313561408, + "grad_norm": 0.8622647523880005, + "learning_rate": 4.7096279722255175e-06, + "loss": 0.7214, + "step": 2602 + }, + { + "epoch": 0.960989416687177, + "grad_norm": 0.8928597569465637, + "learning_rate": 4.709400419712648e-06, + "loss": 0.7067, + "step": 2603 + }, + { + "epoch": 0.9613586020182131, + "grad_norm": 0.8485970497131348, + "learning_rate": 4.709172783574669e-06, + "loss": 0.691, + "step": 2604 + }, + { + "epoch": 0.9617277873492494, + "grad_norm": 0.8822405934333801, + "learning_rate": 4.708945063820198e-06, + "loss": 0.684, + "step": 2605 + }, + { + "epoch": 0.9620969726802855, + "grad_norm": 0.854975700378418, + "learning_rate": 4.708717260457853e-06, + "loss": 0.6718, + "step": 2606 + }, + { + "epoch": 0.9624661580113217, + "grad_norm": 0.86021488904953, + "learning_rate": 4.7084893734962565e-06, + "loss": 0.6677, + "step": 2607 + }, + { + "epoch": 0.9628353433423579, + "grad_norm": 0.9012849926948547, + "learning_rate": 4.708261402944036e-06, + "loss": 0.7292, + "step": 2608 + }, + { + "epoch": 0.963204528673394, + "grad_norm": 0.864490270614624, + "learning_rate": 4.708033348809816e-06, + "loss": 0.7166, + "step": 2609 + }, + { + "epoch": 0.9635737140044303, + "grad_norm": 0.8407897353172302, + "learning_rate": 4.707805211102232e-06, + "loss": 0.7277, + "step": 2610 + }, + { + "epoch": 0.9639428993354664, + "grad_norm": 0.8841310739517212, + "learning_rate": 4.707576989829917e-06, + "loss": 0.6973, + "step": 2611 + }, + { + "epoch": 0.9643120846665025, + "grad_norm": 0.887823760509491, + "learning_rate": 4.7073486850015095e-06, + "loss": 0.71, + "step": 2612 + }, + { + "epoch": 0.9646812699975388, + "grad_norm": 0.8758772015571594, + "learning_rate": 4.707120296625651e-06, + "loss": 0.6605, + "step": 2613 + }, + { + "epoch": 0.9650504553285749, + "grad_norm": 0.8583731055259705, + "learning_rate": 4.7068918247109865e-06, + "loss": 0.6964, + "step": 2614 + }, + { + "epoch": 0.9654196406596112, + "grad_norm": 0.8955153822898865, + "learning_rate": 4.706663269266163e-06, + "loss": 0.7276, + "step": 2615 + }, + { + "epoch": 0.9657888259906473, + "grad_norm": 0.8411804437637329, + "learning_rate": 4.70643463029983e-06, + "loss": 0.6319, + "step": 2616 + }, + { + "epoch": 0.9661580113216834, + "grad_norm": 0.8701980710029602, + "learning_rate": 4.706205907820643e-06, + "loss": 0.6796, + "step": 2617 + }, + { + "epoch": 0.9665271966527197, + "grad_norm": 0.8797522783279419, + "learning_rate": 4.705977101837259e-06, + "loss": 0.7139, + "step": 2618 + }, + { + "epoch": 0.9668963819837558, + "grad_norm": 0.8831002712249756, + "learning_rate": 4.705748212358339e-06, + "loss": 0.6813, + "step": 2619 + }, + { + "epoch": 0.9672655673147921, + "grad_norm": 0.8374364376068115, + "learning_rate": 4.705519239392544e-06, + "loss": 0.6509, + "step": 2620 + }, + { + "epoch": 0.9676347526458282, + "grad_norm": 0.8649409413337708, + "learning_rate": 4.705290182948542e-06, + "loss": 0.6692, + "step": 2621 + }, + { + "epoch": 0.9680039379768643, + "grad_norm": 0.9725558757781982, + "learning_rate": 4.705061043035002e-06, + "loss": 0.6921, + "step": 2622 + }, + { + "epoch": 0.9683731233079006, + "grad_norm": 1.5726717710494995, + "learning_rate": 4.704831819660598e-06, + "loss": 0.73, + "step": 2623 + }, + { + "epoch": 0.9687423086389367, + "grad_norm": 0.8848654627799988, + "learning_rate": 4.704602512834006e-06, + "loss": 0.692, + "step": 2624 + }, + { + "epoch": 0.969111493969973, + "grad_norm": 0.8604928255081177, + "learning_rate": 4.7043731225639045e-06, + "loss": 0.6849, + "step": 2625 + }, + { + "epoch": 0.9694806793010091, + "grad_norm": 0.9043660759925842, + "learning_rate": 4.704143648858976e-06, + "loss": 0.7108, + "step": 2626 + }, + { + "epoch": 0.9698498646320453, + "grad_norm": 0.8489352464675903, + "learning_rate": 4.703914091727906e-06, + "loss": 0.6897, + "step": 2627 + }, + { + "epoch": 0.9702190499630815, + "grad_norm": 0.8445559144020081, + "learning_rate": 4.703684451179382e-06, + "loss": 0.7079, + "step": 2628 + }, + { + "epoch": 0.9705882352941176, + "grad_norm": 0.8575627207756042, + "learning_rate": 4.7034547272220985e-06, + "loss": 0.7166, + "step": 2629 + }, + { + "epoch": 0.9709574206251538, + "grad_norm": 0.8811299800872803, + "learning_rate": 4.703224919864748e-06, + "loss": 0.6726, + "step": 2630 + }, + { + "epoch": 0.97132660595619, + "grad_norm": 0.8279196619987488, + "learning_rate": 4.702995029116031e-06, + "loss": 0.6713, + "step": 2631 + }, + { + "epoch": 0.9716957912872262, + "grad_norm": 0.8848355412483215, + "learning_rate": 4.702765054984646e-06, + "loss": 0.6671, + "step": 2632 + }, + { + "epoch": 0.9720649766182624, + "grad_norm": 0.8621148467063904, + "learning_rate": 4.7025349974793e-06, + "loss": 0.6558, + "step": 2633 + }, + { + "epoch": 0.9724341619492985, + "grad_norm": 0.8555501699447632, + "learning_rate": 4.702304856608698e-06, + "loss": 0.7102, + "step": 2634 + }, + { + "epoch": 0.9728033472803347, + "grad_norm": 0.8470124006271362, + "learning_rate": 4.702074632381553e-06, + "loss": 0.6579, + "step": 2635 + }, + { + "epoch": 0.9731725326113709, + "grad_norm": 0.8950283527374268, + "learning_rate": 4.701844324806579e-06, + "loss": 0.7557, + "step": 2636 + }, + { + "epoch": 0.9735417179424071, + "grad_norm": 0.8791554570198059, + "learning_rate": 4.701613933892491e-06, + "loss": 0.7336, + "step": 2637 + }, + { + "epoch": 0.9739109032734433, + "grad_norm": 0.8484707474708557, + "learning_rate": 4.701383459648011e-06, + "loss": 0.6983, + "step": 2638 + }, + { + "epoch": 0.9742800886044795, + "grad_norm": 0.8885069489479065, + "learning_rate": 4.701152902081863e-06, + "loss": 0.6982, + "step": 2639 + }, + { + "epoch": 0.9746492739355156, + "grad_norm": 0.8474615216255188, + "learning_rate": 4.700922261202771e-06, + "loss": 0.6982, + "step": 2640 + }, + { + "epoch": 0.9750184592665518, + "grad_norm": 0.8749250769615173, + "learning_rate": 4.7006915370194655e-06, + "loss": 0.6771, + "step": 2641 + }, + { + "epoch": 0.975387644597588, + "grad_norm": 0.8137475848197937, + "learning_rate": 4.70046072954068e-06, + "loss": 0.6859, + "step": 2642 + }, + { + "epoch": 0.9757568299286241, + "grad_norm": 0.8566490411758423, + "learning_rate": 4.700229838775151e-06, + "loss": 0.7159, + "step": 2643 + }, + { + "epoch": 0.9761260152596604, + "grad_norm": 0.8723783493041992, + "learning_rate": 4.699998864731617e-06, + "loss": 0.7309, + "step": 2644 + }, + { + "epoch": 0.9764952005906965, + "grad_norm": 0.8612366914749146, + "learning_rate": 4.699767807418821e-06, + "loss": 0.6921, + "step": 2645 + }, + { + "epoch": 0.9768643859217327, + "grad_norm": 0.8468891382217407, + "learning_rate": 4.699536666845507e-06, + "loss": 0.6677, + "step": 2646 + }, + { + "epoch": 0.9772335712527689, + "grad_norm": 0.8550125956535339, + "learning_rate": 4.699305443020424e-06, + "loss": 0.6939, + "step": 2647 + }, + { + "epoch": 0.977602756583805, + "grad_norm": 0.9459806680679321, + "learning_rate": 4.699074135952324e-06, + "loss": 0.6429, + "step": 2648 + }, + { + "epoch": 0.9779719419148413, + "grad_norm": 0.8765318989753723, + "learning_rate": 4.6988427456499624e-06, + "loss": 0.6997, + "step": 2649 + }, + { + "epoch": 0.9783411272458774, + "grad_norm": 0.8627922534942627, + "learning_rate": 4.698611272122097e-06, + "loss": 0.6853, + "step": 2650 + }, + { + "epoch": 0.9787103125769137, + "grad_norm": 0.8556888103485107, + "learning_rate": 4.698379715377489e-06, + "loss": 0.6969, + "step": 2651 + }, + { + "epoch": 0.9790794979079498, + "grad_norm": 0.8721787929534912, + "learning_rate": 4.698148075424902e-06, + "loss": 0.6976, + "step": 2652 + }, + { + "epoch": 0.9794486832389859, + "grad_norm": 0.8606870770454407, + "learning_rate": 4.697916352273104e-06, + "loss": 0.6786, + "step": 2653 + }, + { + "epoch": 0.9798178685700222, + "grad_norm": 0.8361164927482605, + "learning_rate": 4.6976845459308664e-06, + "loss": 0.6847, + "step": 2654 + }, + { + "epoch": 0.9801870539010583, + "grad_norm": 0.8861737847328186, + "learning_rate": 4.697452656406963e-06, + "loss": 0.721, + "step": 2655 + }, + { + "epoch": 0.9805562392320946, + "grad_norm": 0.8805361390113831, + "learning_rate": 4.697220683710168e-06, + "loss": 0.681, + "step": 2656 + }, + { + "epoch": 0.9809254245631307, + "grad_norm": 0.8715226650238037, + "learning_rate": 4.696988627849265e-06, + "loss": 0.7021, + "step": 2657 + }, + { + "epoch": 0.9812946098941668, + "grad_norm": 0.866218090057373, + "learning_rate": 4.696756488833035e-06, + "loss": 0.6957, + "step": 2658 + }, + { + "epoch": 0.9816637952252031, + "grad_norm": 0.8797804713249207, + "learning_rate": 4.696524266670266e-06, + "loss": 0.7064, + "step": 2659 + }, + { + "epoch": 0.9820329805562392, + "grad_norm": 0.9008505940437317, + "learning_rate": 4.696291961369747e-06, + "loss": 0.6895, + "step": 2660 + }, + { + "epoch": 0.9824021658872754, + "grad_norm": 0.8940179944038391, + "learning_rate": 4.696059572940269e-06, + "loss": 0.6735, + "step": 2661 + }, + { + "epoch": 0.9827713512183116, + "grad_norm": 0.8762912750244141, + "learning_rate": 4.69582710139063e-06, + "loss": 0.7487, + "step": 2662 + }, + { + "epoch": 0.9831405365493477, + "grad_norm": 0.8566141724586487, + "learning_rate": 4.695594546729629e-06, + "loss": 0.7056, + "step": 2663 + }, + { + "epoch": 0.983509721880384, + "grad_norm": 0.8390647172927856, + "learning_rate": 4.695361908966066e-06, + "loss": 0.6885, + "step": 2664 + }, + { + "epoch": 0.9838789072114201, + "grad_norm": 0.8426521420478821, + "learning_rate": 4.6951291881087495e-06, + "loss": 0.7038, + "step": 2665 + }, + { + "epoch": 0.9842480925424563, + "grad_norm": 0.8740254640579224, + "learning_rate": 4.6948963841664845e-06, + "loss": 0.7153, + "step": 2666 + }, + { + "epoch": 0.9846172778734925, + "grad_norm": 0.8205375075340271, + "learning_rate": 4.694663497148084e-06, + "loss": 0.7011, + "step": 2667 + }, + { + "epoch": 0.9849864632045287, + "grad_norm": 0.8569832444190979, + "learning_rate": 4.6944305270623634e-06, + "loss": 0.6939, + "step": 2668 + }, + { + "epoch": 0.9853556485355649, + "grad_norm": 0.8959291577339172, + "learning_rate": 4.694197473918139e-06, + "loss": 0.7057, + "step": 2669 + }, + { + "epoch": 0.985724833866601, + "grad_norm": 0.8417198657989502, + "learning_rate": 4.6939643377242335e-06, + "loss": 0.6881, + "step": 2670 + }, + { + "epoch": 0.9860940191976372, + "grad_norm": 0.8587856888771057, + "learning_rate": 4.693731118489471e-06, + "loss": 0.7073, + "step": 2671 + }, + { + "epoch": 0.9864632045286734, + "grad_norm": 0.8757145404815674, + "learning_rate": 4.693497816222676e-06, + "loss": 0.7277, + "step": 2672 + }, + { + "epoch": 0.9868323898597096, + "grad_norm": 0.8414481282234192, + "learning_rate": 4.693264430932683e-06, + "loss": 0.68, + "step": 2673 + }, + { + "epoch": 0.9872015751907458, + "grad_norm": 0.8073148131370544, + "learning_rate": 4.693030962628322e-06, + "loss": 0.6434, + "step": 2674 + }, + { + "epoch": 0.9875707605217819, + "grad_norm": 0.8849506378173828, + "learning_rate": 4.692797411318432e-06, + "loss": 0.7039, + "step": 2675 + }, + { + "epoch": 0.9879399458528181, + "grad_norm": 0.829002857208252, + "learning_rate": 4.692563777011852e-06, + "loss": 0.6811, + "step": 2676 + }, + { + "epoch": 0.9883091311838543, + "grad_norm": 0.8336803317070007, + "learning_rate": 4.692330059717425e-06, + "loss": 0.6915, + "step": 2677 + }, + { + "epoch": 0.9886783165148905, + "grad_norm": 0.8502470850944519, + "learning_rate": 4.6920962594439965e-06, + "loss": 0.6678, + "step": 2678 + }, + { + "epoch": 0.9890475018459266, + "grad_norm": 0.8810563683509827, + "learning_rate": 4.691862376200418e-06, + "loss": 0.702, + "step": 2679 + }, + { + "epoch": 0.9894166871769629, + "grad_norm": 0.8533656597137451, + "learning_rate": 4.691628409995539e-06, + "loss": 0.6748, + "step": 2680 + }, + { + "epoch": 0.989785872507999, + "grad_norm": 0.8637438416481018, + "learning_rate": 4.6913943608382166e-06, + "loss": 0.7256, + "step": 2681 + }, + { + "epoch": 0.9901550578390352, + "grad_norm": 0.8353433609008789, + "learning_rate": 4.691160228737309e-06, + "loss": 0.6965, + "step": 2682 + }, + { + "epoch": 0.9905242431700714, + "grad_norm": 0.8654287457466125, + "learning_rate": 4.690926013701678e-06, + "loss": 0.6864, + "step": 2683 + }, + { + "epoch": 0.9908934285011075, + "grad_norm": 0.8387684226036072, + "learning_rate": 4.69069171574019e-06, + "loss": 0.6389, + "step": 2684 + }, + { + "epoch": 0.9912626138321438, + "grad_norm": 0.908822774887085, + "learning_rate": 4.690457334861711e-06, + "loss": 0.7105, + "step": 2685 + }, + { + "epoch": 0.9916317991631799, + "grad_norm": 0.8545365333557129, + "learning_rate": 4.690222871075114e-06, + "loss": 0.722, + "step": 2686 + }, + { + "epoch": 0.9920009844942161, + "grad_norm": 0.8538405299186707, + "learning_rate": 4.689988324389272e-06, + "loss": 0.6503, + "step": 2687 + }, + { + "epoch": 0.9923701698252523, + "grad_norm": 0.8731146454811096, + "learning_rate": 4.689753694813063e-06, + "loss": 0.6982, + "step": 2688 + }, + { + "epoch": 0.9927393551562884, + "grad_norm": 0.8791890144348145, + "learning_rate": 4.689518982355369e-06, + "loss": 0.6837, + "step": 2689 + }, + { + "epoch": 0.9931085404873247, + "grad_norm": 0.8614819049835205, + "learning_rate": 4.68928418702507e-06, + "loss": 0.7137, + "step": 2690 + }, + { + "epoch": 0.9934777258183608, + "grad_norm": 0.8743491768836975, + "learning_rate": 4.689049308831057e-06, + "loss": 0.7015, + "step": 2691 + }, + { + "epoch": 0.993846911149397, + "grad_norm": 0.8765535354614258, + "learning_rate": 4.688814347782219e-06, + "loss": 0.6774, + "step": 2692 + }, + { + "epoch": 0.9942160964804332, + "grad_norm": 0.8722323775291443, + "learning_rate": 4.6885793038874486e-06, + "loss": 0.6862, + "step": 2693 + }, + { + "epoch": 0.9945852818114693, + "grad_norm": 0.8379456400871277, + "learning_rate": 4.688344177155642e-06, + "loss": 0.6752, + "step": 2694 + }, + { + "epoch": 0.9949544671425056, + "grad_norm": 0.8696368932723999, + "learning_rate": 4.688108967595699e-06, + "loss": 0.6786, + "step": 2695 + }, + { + "epoch": 0.9953236524735417, + "grad_norm": 0.8123127818107605, + "learning_rate": 4.687873675216522e-06, + "loss": 0.6704, + "step": 2696 + }, + { + "epoch": 0.9956928378045778, + "grad_norm": 0.8309715390205383, + "learning_rate": 4.687638300027016e-06, + "loss": 0.6552, + "step": 2697 + }, + { + "epoch": 0.9960620231356141, + "grad_norm": 0.8737226724624634, + "learning_rate": 4.687402842036092e-06, + "loss": 0.7223, + "step": 2698 + }, + { + "epoch": 0.9964312084666502, + "grad_norm": 0.8684542179107666, + "learning_rate": 4.687167301252661e-06, + "loss": 0.6992, + "step": 2699 + }, + { + "epoch": 0.9968003937976865, + "grad_norm": 0.848534345626831, + "learning_rate": 4.686931677685637e-06, + "loss": 0.7061, + "step": 2700 + }, + { + "epoch": 0.9971695791287226, + "grad_norm": 0.862679123878479, + "learning_rate": 4.68669597134394e-06, + "loss": 0.7094, + "step": 2701 + }, + { + "epoch": 0.9975387644597588, + "grad_norm": 0.8820730447769165, + "learning_rate": 4.68646018223649e-06, + "loss": 0.6943, + "step": 2702 + }, + { + "epoch": 0.997907949790795, + "grad_norm": 0.8731999397277832, + "learning_rate": 4.686224310372213e-06, + "loss": 0.7011, + "step": 2703 + }, + { + "epoch": 0.9982771351218311, + "grad_norm": 0.852350115776062, + "learning_rate": 4.685988355760035e-06, + "loss": 0.6635, + "step": 2704 + }, + { + "epoch": 0.9986463204528674, + "grad_norm": 0.8917254209518433, + "learning_rate": 4.6857523184088875e-06, + "loss": 0.7044, + "step": 2705 + }, + { + "epoch": 0.9990155057839035, + "grad_norm": 0.900327742099762, + "learning_rate": 4.685516198327705e-06, + "loss": 0.7145, + "step": 2706 + }, + { + "epoch": 0.9993846911149397, + "grad_norm": 0.9128772020339966, + "learning_rate": 4.685279995525424e-06, + "loss": 0.7169, + "step": 2707 + }, + { + "epoch": 0.9997538764459759, + "grad_norm": 0.8816156983375549, + "learning_rate": 4.685043710010985e-06, + "loss": 0.7372, + "step": 2708 + }, + { + "epoch": 1.0, + "grad_norm": 0.8816156983375549, + "learning_rate": 4.684807341793332e-06, + "loss": 0.7025, + "step": 2709 + }, + { + "epoch": 1.0003691853310361, + "grad_norm": 1.1668752431869507, + "learning_rate": 4.684570890881409e-06, + "loss": 0.6923, + "step": 2710 + }, + { + "epoch": 1.0007383706620723, + "grad_norm": 0.992971658706665, + "learning_rate": 4.684334357284168e-06, + "loss": 0.6974, + "step": 2711 + }, + { + "epoch": 1.0011075559931086, + "grad_norm": 0.8744128942489624, + "learning_rate": 4.684097741010562e-06, + "loss": 0.6578, + "step": 2712 + }, + { + "epoch": 1.0014767413241448, + "grad_norm": 0.876900315284729, + "learning_rate": 4.683861042069544e-06, + "loss": 0.7005, + "step": 2713 + }, + { + "epoch": 1.001845926655181, + "grad_norm": 0.8856205344200134, + "learning_rate": 4.683624260470075e-06, + "loss": 0.7139, + "step": 2714 + }, + { + "epoch": 1.002215111986217, + "grad_norm": 0.8850727677345276, + "learning_rate": 4.683387396221118e-06, + "loss": 0.6535, + "step": 2715 + }, + { + "epoch": 1.0025842973172532, + "grad_norm": 0.8519180417060852, + "learning_rate": 4.683150449331637e-06, + "loss": 0.6936, + "step": 2716 + }, + { + "epoch": 1.0029534826482895, + "grad_norm": 0.8828544020652771, + "learning_rate": 4.6829134198106e-06, + "loss": 0.6987, + "step": 2717 + }, + { + "epoch": 1.0033226679793257, + "grad_norm": 0.953484296798706, + "learning_rate": 4.682676307666979e-06, + "loss": 0.7114, + "step": 2718 + }, + { + "epoch": 1.0036918533103618, + "grad_norm": 0.9129650592803955, + "learning_rate": 4.682439112909749e-06, + "loss": 0.6723, + "step": 2719 + }, + { + "epoch": 1.004061038641398, + "grad_norm": 0.860188901424408, + "learning_rate": 4.682201835547887e-06, + "loss": 0.6909, + "step": 2720 + }, + { + "epoch": 1.004430223972434, + "grad_norm": 0.8592462539672852, + "learning_rate": 4.6819644755903746e-06, + "loss": 0.6667, + "step": 2721 + }, + { + "epoch": 1.0047994093034704, + "grad_norm": 0.8863035440444946, + "learning_rate": 4.681727033046195e-06, + "loss": 0.6953, + "step": 2722 + }, + { + "epoch": 1.0051685946345066, + "grad_norm": 0.8873715996742249, + "learning_rate": 4.681489507924336e-06, + "loss": 0.7, + "step": 2723 + }, + { + "epoch": 1.0055377799655427, + "grad_norm": 0.8978347778320312, + "learning_rate": 4.681251900233788e-06, + "loss": 0.6675, + "step": 2724 + }, + { + "epoch": 1.0059069652965789, + "grad_norm": 0.8402617573738098, + "learning_rate": 4.681014209983543e-06, + "loss": 0.6412, + "step": 2725 + }, + { + "epoch": 1.006276150627615, + "grad_norm": 0.8753883242607117, + "learning_rate": 4.680776437182599e-06, + "loss": 0.6723, + "step": 2726 + }, + { + "epoch": 1.0066453359586511, + "grad_norm": 0.8595938086509705, + "learning_rate": 4.680538581839955e-06, + "loss": 0.6821, + "step": 2727 + }, + { + "epoch": 1.0070145212896875, + "grad_norm": 0.8935231566429138, + "learning_rate": 4.6803006439646145e-06, + "loss": 0.6874, + "step": 2728 + }, + { + "epoch": 1.0073837066207236, + "grad_norm": 0.8682321906089783, + "learning_rate": 4.6800626235655825e-06, + "loss": 0.674, + "step": 2729 + }, + { + "epoch": 1.0077528919517598, + "grad_norm": 0.8434739708900452, + "learning_rate": 4.679824520651868e-06, + "loss": 0.6583, + "step": 2730 + }, + { + "epoch": 1.008122077282796, + "grad_norm": 0.8962522745132446, + "learning_rate": 4.679586335232484e-06, + "loss": 0.6327, + "step": 2731 + }, + { + "epoch": 1.008491262613832, + "grad_norm": 0.8714585900306702, + "learning_rate": 4.679348067316444e-06, + "loss": 0.6715, + "step": 2732 + }, + { + "epoch": 1.0088604479448684, + "grad_norm": 0.8924304246902466, + "learning_rate": 4.679109716912768e-06, + "loss": 0.6638, + "step": 2733 + }, + { + "epoch": 1.0092296332759045, + "grad_norm": 0.8760807514190674, + "learning_rate": 4.678871284030476e-06, + "loss": 0.6646, + "step": 2734 + }, + { + "epoch": 1.0095988186069407, + "grad_norm": 0.8640549182891846, + "learning_rate": 4.678632768678595e-06, + "loss": 0.6524, + "step": 2735 + }, + { + "epoch": 1.0099680039379768, + "grad_norm": 0.9103869199752808, + "learning_rate": 4.67839417086615e-06, + "loss": 0.6819, + "step": 2736 + }, + { + "epoch": 1.010337189269013, + "grad_norm": 0.8370369076728821, + "learning_rate": 4.678155490602174e-06, + "loss": 0.6791, + "step": 2737 + }, + { + "epoch": 1.0107063746000493, + "grad_norm": 0.8959705233573914, + "learning_rate": 4.677916727895701e-06, + "loss": 0.6624, + "step": 2738 + }, + { + "epoch": 1.0110755599310854, + "grad_norm": 0.8826505541801453, + "learning_rate": 4.677677882755765e-06, + "loss": 0.6403, + "step": 2739 + }, + { + "epoch": 1.0114447452621216, + "grad_norm": 0.8534560203552246, + "learning_rate": 4.67743895519141e-06, + "loss": 0.6617, + "step": 2740 + }, + { + "epoch": 1.0118139305931577, + "grad_norm": 0.8508325815200806, + "learning_rate": 4.677199945211678e-06, + "loss": 0.6706, + "step": 2741 + }, + { + "epoch": 1.0121831159241939, + "grad_norm": 0.8714326024055481, + "learning_rate": 4.676960852825614e-06, + "loss": 0.7062, + "step": 2742 + }, + { + "epoch": 1.0125523012552302, + "grad_norm": 0.8997061848640442, + "learning_rate": 4.67672167804227e-06, + "loss": 0.6854, + "step": 2743 + }, + { + "epoch": 1.0129214865862664, + "grad_norm": 0.8811880946159363, + "learning_rate": 4.676482420870697e-06, + "loss": 0.6854, + "step": 2744 + }, + { + "epoch": 1.0132906719173025, + "grad_norm": 0.8865457773208618, + "learning_rate": 4.676243081319951e-06, + "loss": 0.6852, + "step": 2745 + }, + { + "epoch": 1.0136598572483386, + "grad_norm": 0.9185889363288879, + "learning_rate": 4.676003659399091e-06, + "loss": 0.6957, + "step": 2746 + }, + { + "epoch": 1.0140290425793748, + "grad_norm": 0.8736449480056763, + "learning_rate": 4.675764155117179e-06, + "loss": 0.6637, + "step": 2747 + }, + { + "epoch": 1.0143982279104111, + "grad_norm": 0.9179585576057434, + "learning_rate": 4.675524568483282e-06, + "loss": 0.6923, + "step": 2748 + }, + { + "epoch": 1.0147674132414473, + "grad_norm": 0.866867184638977, + "learning_rate": 4.675284899506465e-06, + "loss": 0.7006, + "step": 2749 + }, + { + "epoch": 1.0151365985724834, + "grad_norm": 0.8886509537696838, + "learning_rate": 4.675045148195803e-06, + "loss": 0.6682, + "step": 2750 + }, + { + "epoch": 1.0155057839035195, + "grad_norm": 0.8733989000320435, + "learning_rate": 4.674805314560367e-06, + "loss": 0.6535, + "step": 2751 + }, + { + "epoch": 1.0158749692345557, + "grad_norm": 0.866317629814148, + "learning_rate": 4.674565398609236e-06, + "loss": 0.6842, + "step": 2752 + }, + { + "epoch": 1.016244154565592, + "grad_norm": 0.8644862174987793, + "learning_rate": 4.674325400351492e-06, + "loss": 0.6859, + "step": 2753 + }, + { + "epoch": 1.0166133398966282, + "grad_norm": 0.8511524796485901, + "learning_rate": 4.674085319796217e-06, + "loss": 0.6681, + "step": 2754 + }, + { + "epoch": 1.0169825252276643, + "grad_norm": 0.8777972459793091, + "learning_rate": 4.673845156952499e-06, + "loss": 0.6612, + "step": 2755 + }, + { + "epoch": 1.0173517105587004, + "grad_norm": 0.9387547969818115, + "learning_rate": 4.673604911829428e-06, + "loss": 0.6607, + "step": 2756 + }, + { + "epoch": 1.0177208958897366, + "grad_norm": 0.8887958526611328, + "learning_rate": 4.673364584436098e-06, + "loss": 0.6947, + "step": 2757 + }, + { + "epoch": 1.018090081220773, + "grad_norm": 0.8562855124473572, + "learning_rate": 4.673124174781603e-06, + "loss": 0.6915, + "step": 2758 + }, + { + "epoch": 1.018459266551809, + "grad_norm": 0.9101713299751282, + "learning_rate": 4.672883682875045e-06, + "loss": 0.7139, + "step": 2759 + }, + { + "epoch": 1.0188284518828452, + "grad_norm": 0.8841733336448669, + "learning_rate": 4.6726431087255244e-06, + "loss": 0.6898, + "step": 2760 + }, + { + "epoch": 1.0191976372138813, + "grad_norm": 0.8672334551811218, + "learning_rate": 4.672402452342148e-06, + "loss": 0.6605, + "step": 2761 + }, + { + "epoch": 1.0195668225449175, + "grad_norm": 0.8886809349060059, + "learning_rate": 4.672161713734025e-06, + "loss": 0.7086, + "step": 2762 + }, + { + "epoch": 1.0199360078759536, + "grad_norm": 0.8754507899284363, + "learning_rate": 4.671920892910266e-06, + "loss": 0.672, + "step": 2763 + }, + { + "epoch": 1.02030519320699, + "grad_norm": 0.8461178541183472, + "learning_rate": 4.671679989879987e-06, + "loss": 0.7062, + "step": 2764 + }, + { + "epoch": 1.0206743785380261, + "grad_norm": 0.8500531315803528, + "learning_rate": 4.6714390046523055e-06, + "loss": 0.6108, + "step": 2765 + }, + { + "epoch": 1.0210435638690623, + "grad_norm": 0.8619534969329834, + "learning_rate": 4.671197937236343e-06, + "loss": 0.6607, + "step": 2766 + }, + { + "epoch": 1.0214127492000984, + "grad_norm": 0.8798885941505432, + "learning_rate": 4.670956787641224e-06, + "loss": 0.693, + "step": 2767 + }, + { + "epoch": 1.0217819345311345, + "grad_norm": 0.8915190696716309, + "learning_rate": 4.670715555876077e-06, + "loss": 0.6821, + "step": 2768 + }, + { + "epoch": 1.0221511198621709, + "grad_norm": 0.8591749668121338, + "learning_rate": 4.670474241950029e-06, + "loss": 0.678, + "step": 2769 + }, + { + "epoch": 1.022520305193207, + "grad_norm": 0.8759719133377075, + "learning_rate": 4.670232845872217e-06, + "loss": 0.6896, + "step": 2770 + }, + { + "epoch": 1.0228894905242432, + "grad_norm": 0.884102463722229, + "learning_rate": 4.669991367651777e-06, + "loss": 0.6461, + "step": 2771 + }, + { + "epoch": 1.0232586758552793, + "grad_norm": 0.8832391500473022, + "learning_rate": 4.6697498072978484e-06, + "loss": 0.6683, + "step": 2772 + }, + { + "epoch": 1.0236278611863154, + "grad_norm": 0.9172348976135254, + "learning_rate": 4.669508164819574e-06, + "loss": 0.667, + "step": 2773 + }, + { + "epoch": 1.0239970465173518, + "grad_norm": 0.8954014778137207, + "learning_rate": 4.669266440226102e-06, + "loss": 0.6653, + "step": 2774 + }, + { + "epoch": 1.024366231848388, + "grad_norm": 0.8748698830604553, + "learning_rate": 4.669024633526578e-06, + "loss": 0.6948, + "step": 2775 + }, + { + "epoch": 1.024735417179424, + "grad_norm": 0.8959664106369019, + "learning_rate": 4.668782744730156e-06, + "loss": 0.6611, + "step": 2776 + }, + { + "epoch": 1.0251046025104602, + "grad_norm": 0.8544363379478455, + "learning_rate": 4.668540773845992e-06, + "loss": 0.6573, + "step": 2777 + }, + { + "epoch": 1.0254737878414963, + "grad_norm": 0.8482572436332703, + "learning_rate": 4.668298720883244e-06, + "loss": 0.7281, + "step": 2778 + }, + { + "epoch": 1.0258429731725327, + "grad_norm": 0.8772283792495728, + "learning_rate": 4.668056585851075e-06, + "loss": 0.6602, + "step": 2779 + }, + { + "epoch": 1.0262121585035688, + "grad_norm": 0.8639369606971741, + "learning_rate": 4.667814368758648e-06, + "loss": 0.66, + "step": 2780 + }, + { + "epoch": 1.026581343834605, + "grad_norm": 0.8619779944419861, + "learning_rate": 4.66757206961513e-06, + "loss": 0.6254, + "step": 2781 + }, + { + "epoch": 1.0269505291656411, + "grad_norm": 0.8503388166427612, + "learning_rate": 4.667329688429694e-06, + "loss": 0.6661, + "step": 2782 + }, + { + "epoch": 1.0273197144966772, + "grad_norm": 0.8873377442359924, + "learning_rate": 4.667087225211514e-06, + "loss": 0.6709, + "step": 2783 + }, + { + "epoch": 1.0276888998277136, + "grad_norm": 0.8564530611038208, + "learning_rate": 4.666844679969765e-06, + "loss": 0.6775, + "step": 2784 + }, + { + "epoch": 1.0280580851587497, + "grad_norm": 0.9312854409217834, + "learning_rate": 4.66660205271363e-06, + "loss": 0.6542, + "step": 2785 + }, + { + "epoch": 1.0284272704897859, + "grad_norm": 0.9055665731430054, + "learning_rate": 4.666359343452291e-06, + "loss": 0.6633, + "step": 2786 + }, + { + "epoch": 1.028796455820822, + "grad_norm": 0.8719002604484558, + "learning_rate": 4.6661165521949345e-06, + "loss": 0.6888, + "step": 2787 + }, + { + "epoch": 1.0291656411518582, + "grad_norm": 0.8875705599784851, + "learning_rate": 4.66587367895075e-06, + "loss": 0.6614, + "step": 2788 + }, + { + "epoch": 1.0295348264828945, + "grad_norm": 0.8961500525474548, + "learning_rate": 4.66563072372893e-06, + "loss": 0.6584, + "step": 2789 + }, + { + "epoch": 1.0299040118139307, + "grad_norm": 0.895921528339386, + "learning_rate": 4.6653876865386704e-06, + "loss": 0.6615, + "step": 2790 + }, + { + "epoch": 1.0302731971449668, + "grad_norm": 0.8877645134925842, + "learning_rate": 4.665144567389171e-06, + "loss": 0.6416, + "step": 2791 + }, + { + "epoch": 1.030642382476003, + "grad_norm": 0.8500512838363647, + "learning_rate": 4.664901366289633e-06, + "loss": 0.6618, + "step": 2792 + }, + { + "epoch": 1.031011567807039, + "grad_norm": 0.8770802021026611, + "learning_rate": 4.6646580832492616e-06, + "loss": 0.6836, + "step": 2793 + }, + { + "epoch": 1.0313807531380754, + "grad_norm": 0.8622767329216003, + "learning_rate": 4.664414718277264e-06, + "loss": 0.6847, + "step": 2794 + }, + { + "epoch": 1.0317499384691116, + "grad_norm": 0.9275234341621399, + "learning_rate": 4.664171271382854e-06, + "loss": 0.6945, + "step": 2795 + }, + { + "epoch": 1.0321191238001477, + "grad_norm": 0.9547581672668457, + "learning_rate": 4.6639277425752425e-06, + "loss": 0.6751, + "step": 2796 + }, + { + "epoch": 1.0324883091311838, + "grad_norm": 0.8790690898895264, + "learning_rate": 4.66368413186365e-06, + "loss": 0.6804, + "step": 2797 + }, + { + "epoch": 1.03285749446222, + "grad_norm": 0.8583581447601318, + "learning_rate": 4.663440439257296e-06, + "loss": 0.6443, + "step": 2798 + }, + { + "epoch": 1.033226679793256, + "grad_norm": 0.8776803016662598, + "learning_rate": 4.663196664765404e-06, + "loss": 0.685, + "step": 2799 + }, + { + "epoch": 1.0335958651242925, + "grad_norm": 0.9127188920974731, + "learning_rate": 4.662952808397201e-06, + "loss": 0.6514, + "step": 2800 + }, + { + "epoch": 1.0339650504553286, + "grad_norm": 0.9128128290176392, + "learning_rate": 4.662708870161916e-06, + "loss": 0.6934, + "step": 2801 + }, + { + "epoch": 1.0343342357863647, + "grad_norm": 0.9004178047180176, + "learning_rate": 4.662464850068784e-06, + "loss": 0.6892, + "step": 2802 + }, + { + "epoch": 1.0347034211174009, + "grad_norm": 0.8885177969932556, + "learning_rate": 4.66222074812704e-06, + "loss": 0.6409, + "step": 2803 + }, + { + "epoch": 1.035072606448437, + "grad_norm": 0.8857535719871521, + "learning_rate": 4.661976564345923e-06, + "loss": 0.6504, + "step": 2804 + }, + { + "epoch": 1.0354417917794734, + "grad_norm": 0.8746016621589661, + "learning_rate": 4.661732298734674e-06, + "loss": 0.7247, + "step": 2805 + }, + { + "epoch": 1.0358109771105095, + "grad_norm": 0.9259986877441406, + "learning_rate": 4.661487951302541e-06, + "loss": 0.6431, + "step": 2806 + }, + { + "epoch": 1.0361801624415456, + "grad_norm": 0.8784388899803162, + "learning_rate": 4.661243522058771e-06, + "loss": 0.7268, + "step": 2807 + }, + { + "epoch": 1.0365493477725818, + "grad_norm": 0.8684430122375488, + "learning_rate": 4.6609990110126155e-06, + "loss": 0.7038, + "step": 2808 + }, + { + "epoch": 1.036918533103618, + "grad_norm": 0.8909395337104797, + "learning_rate": 4.66075441817333e-06, + "loss": 0.6559, + "step": 2809 + }, + { + "epoch": 1.0372877184346543, + "grad_norm": 0.8084502220153809, + "learning_rate": 4.660509743550171e-06, + "loss": 0.6547, + "step": 2810 + }, + { + "epoch": 1.0376569037656904, + "grad_norm": 0.873261034488678, + "learning_rate": 4.660264987152401e-06, + "loss": 0.69, + "step": 2811 + }, + { + "epoch": 1.0380260890967266, + "grad_norm": 0.8579702973365784, + "learning_rate": 4.660020148989282e-06, + "loss": 0.6655, + "step": 2812 + }, + { + "epoch": 1.0383952744277627, + "grad_norm": 0.8724539875984192, + "learning_rate": 4.659775229070082e-06, + "loss": 0.6684, + "step": 2813 + }, + { + "epoch": 1.0387644597587988, + "grad_norm": 0.8407760262489319, + "learning_rate": 4.6595302274040725e-06, + "loss": 0.6622, + "step": 2814 + }, + { + "epoch": 1.0391336450898352, + "grad_norm": 0.875041663646698, + "learning_rate": 4.659285144000525e-06, + "loss": 0.6764, + "step": 2815 + }, + { + "epoch": 1.0395028304208713, + "grad_norm": 0.9029369354248047, + "learning_rate": 4.659039978868716e-06, + "loss": 0.6974, + "step": 2816 + }, + { + "epoch": 1.0398720157519075, + "grad_norm": 0.8669332265853882, + "learning_rate": 4.658794732017925e-06, + "loss": 0.6628, + "step": 2817 + }, + { + "epoch": 1.0402412010829436, + "grad_norm": 0.8982745409011841, + "learning_rate": 4.658549403457435e-06, + "loss": 0.6906, + "step": 2818 + }, + { + "epoch": 1.0406103864139797, + "grad_norm": 0.8255721926689148, + "learning_rate": 4.6583039931965315e-06, + "loss": 0.6636, + "step": 2819 + }, + { + "epoch": 1.040979571745016, + "grad_norm": 0.8601429462432861, + "learning_rate": 4.6580585012445025e-06, + "loss": 0.705, + "step": 2820 + }, + { + "epoch": 1.0413487570760522, + "grad_norm": 0.877720832824707, + "learning_rate": 4.65781292761064e-06, + "loss": 0.648, + "step": 2821 + }, + { + "epoch": 1.0417179424070884, + "grad_norm": 0.9083960652351379, + "learning_rate": 4.657567272304241e-06, + "loss": 0.6656, + "step": 2822 + }, + { + "epoch": 1.0420871277381245, + "grad_norm": 0.8817479014396667, + "learning_rate": 4.6573215353346e-06, + "loss": 0.6418, + "step": 2823 + }, + { + "epoch": 1.0424563130691606, + "grad_norm": 0.8430233001708984, + "learning_rate": 4.65707571671102e-06, + "loss": 0.7005, + "step": 2824 + }, + { + "epoch": 1.042825498400197, + "grad_norm": 0.8740551471710205, + "learning_rate": 4.656829816442805e-06, + "loss": 0.6408, + "step": 2825 + }, + { + "epoch": 1.0431946837312331, + "grad_norm": 0.8574524521827698, + "learning_rate": 4.656583834539262e-06, + "loss": 0.6791, + "step": 2826 + }, + { + "epoch": 1.0435638690622693, + "grad_norm": 0.854712724685669, + "learning_rate": 4.6563377710097024e-06, + "loss": 0.654, + "step": 2827 + }, + { + "epoch": 1.0439330543933054, + "grad_norm": 0.8926639556884766, + "learning_rate": 4.656091625863438e-06, + "loss": 0.6813, + "step": 2828 + }, + { + "epoch": 1.0443022397243416, + "grad_norm": 0.8685784935951233, + "learning_rate": 4.655845399109786e-06, + "loss": 0.6491, + "step": 2829 + }, + { + "epoch": 1.0446714250553777, + "grad_norm": 0.8460726141929626, + "learning_rate": 4.655599090758066e-06, + "loss": 0.6593, + "step": 2830 + }, + { + "epoch": 1.045040610386414, + "grad_norm": 0.8725365400314331, + "learning_rate": 4.655352700817601e-06, + "loss": 0.6602, + "step": 2831 + }, + { + "epoch": 1.0454097957174502, + "grad_norm": 0.9025354385375977, + "learning_rate": 4.655106229297717e-06, + "loss": 0.689, + "step": 2832 + }, + { + "epoch": 1.0457789810484863, + "grad_norm": 0.8983410596847534, + "learning_rate": 4.6548596762077415e-06, + "loss": 0.6701, + "step": 2833 + }, + { + "epoch": 1.0461481663795225, + "grad_norm": 0.9098207950592041, + "learning_rate": 4.654613041557008e-06, + "loss": 0.7346, + "step": 2834 + }, + { + "epoch": 1.0465173517105586, + "grad_norm": 0.8481409549713135, + "learning_rate": 4.65436632535485e-06, + "loss": 0.6669, + "step": 2835 + }, + { + "epoch": 1.046886537041595, + "grad_norm": 0.8799389600753784, + "learning_rate": 4.654119527610608e-06, + "loss": 0.6634, + "step": 2836 + }, + { + "epoch": 1.047255722372631, + "grad_norm": 0.8397617936134338, + "learning_rate": 4.65387264833362e-06, + "loss": 0.6423, + "step": 2837 + }, + { + "epoch": 1.0476249077036672, + "grad_norm": 0.8797786235809326, + "learning_rate": 4.653625687533233e-06, + "loss": 0.6802, + "step": 2838 + }, + { + "epoch": 1.0479940930347034, + "grad_norm": 0.8861578702926636, + "learning_rate": 4.653378645218793e-06, + "loss": 0.6478, + "step": 2839 + }, + { + "epoch": 1.0483632783657395, + "grad_norm": 0.8413394689559937, + "learning_rate": 4.653131521399651e-06, + "loss": 0.6605, + "step": 2840 + }, + { + "epoch": 1.0487324636967759, + "grad_norm": 0.8685784339904785, + "learning_rate": 4.652884316085161e-06, + "loss": 0.6627, + "step": 2841 + }, + { + "epoch": 1.049101649027812, + "grad_norm": 0.8883354663848877, + "learning_rate": 4.652637029284679e-06, + "loss": 0.6557, + "step": 2842 + }, + { + "epoch": 1.0494708343588481, + "grad_norm": 0.8727714419364929, + "learning_rate": 4.652389661007564e-06, + "loss": 0.665, + "step": 2843 + }, + { + "epoch": 1.0498400196898843, + "grad_norm": 0.9300338625907898, + "learning_rate": 4.6521422112631806e-06, + "loss": 0.6329, + "step": 2844 + }, + { + "epoch": 1.0502092050209204, + "grad_norm": 0.918907880783081, + "learning_rate": 4.651894680060894e-06, + "loss": 0.6626, + "step": 2845 + }, + { + "epoch": 1.0505783903519568, + "grad_norm": 0.8453586101531982, + "learning_rate": 4.651647067410071e-06, + "loss": 0.6189, + "step": 2846 + }, + { + "epoch": 1.050947575682993, + "grad_norm": 0.9038910865783691, + "learning_rate": 4.651399373320087e-06, + "loss": 0.6921, + "step": 2847 + }, + { + "epoch": 1.051316761014029, + "grad_norm": 0.8970220685005188, + "learning_rate": 4.651151597800315e-06, + "loss": 0.6083, + "step": 2848 + }, + { + "epoch": 1.0516859463450652, + "grad_norm": 0.8670986294746399, + "learning_rate": 4.650903740860135e-06, + "loss": 0.666, + "step": 2849 + }, + { + "epoch": 1.0520551316761013, + "grad_norm": 0.9135206341743469, + "learning_rate": 4.650655802508927e-06, + "loss": 0.6559, + "step": 2850 + }, + { + "epoch": 1.0524243170071377, + "grad_norm": 0.8654005527496338, + "learning_rate": 4.650407782756074e-06, + "loss": 0.6628, + "step": 2851 + }, + { + "epoch": 1.0527935023381738, + "grad_norm": 0.8866705894470215, + "learning_rate": 4.650159681610967e-06, + "loss": 0.6616, + "step": 2852 + }, + { + "epoch": 1.05316268766921, + "grad_norm": 0.8775743842124939, + "learning_rate": 4.6499114990829944e-06, + "loss": 0.6874, + "step": 2853 + }, + { + "epoch": 1.053531873000246, + "grad_norm": 0.8597018718719482, + "learning_rate": 4.64966323518155e-06, + "loss": 0.6721, + "step": 2854 + }, + { + "epoch": 1.0539010583312822, + "grad_norm": 0.8523260354995728, + "learning_rate": 4.64941488991603e-06, + "loss": 0.6594, + "step": 2855 + }, + { + "epoch": 1.0542702436623186, + "grad_norm": 0.8759757876396179, + "learning_rate": 4.649166463295834e-06, + "loss": 0.648, + "step": 2856 + }, + { + "epoch": 1.0546394289933547, + "grad_norm": 0.8843923211097717, + "learning_rate": 4.648917955330368e-06, + "loss": 0.7109, + "step": 2857 + }, + { + "epoch": 1.0550086143243909, + "grad_norm": 0.8982211351394653, + "learning_rate": 4.648669366029034e-06, + "loss": 0.6887, + "step": 2858 + }, + { + "epoch": 1.055377799655427, + "grad_norm": 0.8711594939231873, + "learning_rate": 4.6484206954012425e-06, + "loss": 0.645, + "step": 2859 + }, + { + "epoch": 1.0557469849864631, + "grad_norm": 0.8670770525932312, + "learning_rate": 4.648171943456406e-06, + "loss": 0.6941, + "step": 2860 + }, + { + "epoch": 1.0561161703174995, + "grad_norm": 0.8887355327606201, + "learning_rate": 4.64792311020394e-06, + "loss": 0.6732, + "step": 2861 + }, + { + "epoch": 1.0564853556485356, + "grad_norm": 0.8926178812980652, + "learning_rate": 4.647674195653262e-06, + "loss": 0.6551, + "step": 2862 + }, + { + "epoch": 1.0568545409795718, + "grad_norm": 0.8660004138946533, + "learning_rate": 4.647425199813793e-06, + "loss": 0.708, + "step": 2863 + }, + { + "epoch": 1.057223726310608, + "grad_norm": 0.8869983553886414, + "learning_rate": 4.647176122694959e-06, + "loss": 0.6641, + "step": 2864 + }, + { + "epoch": 1.057592911641644, + "grad_norm": 0.8591915369033813, + "learning_rate": 4.646926964306186e-06, + "loss": 0.6133, + "step": 2865 + }, + { + "epoch": 1.0579620969726804, + "grad_norm": 0.8476788401603699, + "learning_rate": 4.646677724656904e-06, + "loss": 0.6777, + "step": 2866 + }, + { + "epoch": 1.0583312823037165, + "grad_norm": 0.8865950703620911, + "learning_rate": 4.646428403756548e-06, + "loss": 0.693, + "step": 2867 + }, + { + "epoch": 1.0587004676347527, + "grad_norm": 0.858095645904541, + "learning_rate": 4.646179001614555e-06, + "loss": 0.6797, + "step": 2868 + }, + { + "epoch": 1.0590696529657888, + "grad_norm": 0.8765902519226074, + "learning_rate": 4.645929518240363e-06, + "loss": 0.6669, + "step": 2869 + }, + { + "epoch": 1.059438838296825, + "grad_norm": 0.8230260610580444, + "learning_rate": 4.645679953643417e-06, + "loss": 0.6557, + "step": 2870 + }, + { + "epoch": 1.059808023627861, + "grad_norm": 0.882996678352356, + "learning_rate": 4.645430307833162e-06, + "loss": 0.7207, + "step": 2871 + }, + { + "epoch": 1.0601772089588974, + "grad_norm": 0.8801681399345398, + "learning_rate": 4.6451805808190464e-06, + "loss": 0.6445, + "step": 2872 + }, + { + "epoch": 1.0605463942899336, + "grad_norm": 0.8836155533790588, + "learning_rate": 4.644930772610523e-06, + "loss": 0.6754, + "step": 2873 + }, + { + "epoch": 1.0609155796209697, + "grad_norm": 0.8495041131973267, + "learning_rate": 4.644680883217046e-06, + "loss": 0.6764, + "step": 2874 + }, + { + "epoch": 1.0612847649520059, + "grad_norm": 0.8706765174865723, + "learning_rate": 4.644430912648076e-06, + "loss": 0.6541, + "step": 2875 + }, + { + "epoch": 1.061653950283042, + "grad_norm": 0.893825352191925, + "learning_rate": 4.644180860913073e-06, + "loss": 0.6547, + "step": 2876 + }, + { + "epoch": 1.0620231356140784, + "grad_norm": 0.8814471960067749, + "learning_rate": 4.6439307280215e-06, + "loss": 0.6529, + "step": 2877 + }, + { + "epoch": 1.0623923209451145, + "grad_norm": 0.9095078706741333, + "learning_rate": 4.643680513982825e-06, + "loss": 0.6519, + "step": 2878 + }, + { + "epoch": 1.0627615062761506, + "grad_norm": 0.87798672914505, + "learning_rate": 4.64343021880652e-06, + "loss": 0.6942, + "step": 2879 + }, + { + "epoch": 1.0631306916071868, + "grad_norm": 0.8998186588287354, + "learning_rate": 4.643179842502058e-06, + "loss": 0.6853, + "step": 2880 + }, + { + "epoch": 1.063499876938223, + "grad_norm": 0.8979682326316833, + "learning_rate": 4.642929385078915e-06, + "loss": 0.6536, + "step": 2881 + }, + { + "epoch": 1.0638690622692593, + "grad_norm": 0.8917797803878784, + "learning_rate": 4.64267884654657e-06, + "loss": 0.7066, + "step": 2882 + }, + { + "epoch": 1.0642382476002954, + "grad_norm": 0.8840612173080444, + "learning_rate": 4.642428226914508e-06, + "loss": 0.6596, + "step": 2883 + }, + { + "epoch": 1.0646074329313315, + "grad_norm": 0.8957401514053345, + "learning_rate": 4.642177526192214e-06, + "loss": 0.6676, + "step": 2884 + }, + { + "epoch": 1.0649766182623677, + "grad_norm": 0.8820780515670776, + "learning_rate": 4.641926744389176e-06, + "loss": 0.6652, + "step": 2885 + }, + { + "epoch": 1.0653458035934038, + "grad_norm": 0.8533323407173157, + "learning_rate": 4.641675881514888e-06, + "loss": 0.6559, + "step": 2886 + }, + { + "epoch": 1.0657149889244402, + "grad_norm": 0.8937400579452515, + "learning_rate": 4.641424937578842e-06, + "loss": 0.6964, + "step": 2887 + }, + { + "epoch": 1.0660841742554763, + "grad_norm": 0.8996254205703735, + "learning_rate": 4.6411739125905395e-06, + "loss": 0.6594, + "step": 2888 + }, + { + "epoch": 1.0664533595865124, + "grad_norm": 0.8615009784698486, + "learning_rate": 4.6409228065594795e-06, + "loss": 0.6913, + "step": 2889 + }, + { + "epoch": 1.0668225449175486, + "grad_norm": 0.8650383353233337, + "learning_rate": 4.640671619495167e-06, + "loss": 0.6527, + "step": 2890 + }, + { + "epoch": 1.0671917302485847, + "grad_norm": 0.8498789072036743, + "learning_rate": 4.640420351407109e-06, + "loss": 0.6912, + "step": 2891 + }, + { + "epoch": 1.067560915579621, + "grad_norm": 0.9019843935966492, + "learning_rate": 4.640169002304817e-06, + "loss": 0.6632, + "step": 2892 + }, + { + "epoch": 1.0679301009106572, + "grad_norm": 0.8868442177772522, + "learning_rate": 4.6399175721978026e-06, + "loss": 0.6737, + "step": 2893 + }, + { + "epoch": 1.0682992862416933, + "grad_norm": 0.872709333896637, + "learning_rate": 4.6396660610955845e-06, + "loss": 0.6594, + "step": 2894 + }, + { + "epoch": 1.0686684715727295, + "grad_norm": 0.9083309173583984, + "learning_rate": 4.639414469007681e-06, + "loss": 0.6875, + "step": 2895 + }, + { + "epoch": 1.0690376569037656, + "grad_norm": 0.859743058681488, + "learning_rate": 4.639162795943614e-06, + "loss": 0.6398, + "step": 2896 + }, + { + "epoch": 1.0694068422348018, + "grad_norm": 0.896567702293396, + "learning_rate": 4.638911041912912e-06, + "loss": 0.6658, + "step": 2897 + }, + { + "epoch": 1.0697760275658381, + "grad_norm": 0.859343409538269, + "learning_rate": 4.638659206925101e-06, + "loss": 0.6917, + "step": 2898 + }, + { + "epoch": 1.0701452128968743, + "grad_norm": 0.9221862554550171, + "learning_rate": 4.638407290989715e-06, + "loss": 0.6779, + "step": 2899 + }, + { + "epoch": 1.0705143982279104, + "grad_norm": 0.8592720031738281, + "learning_rate": 4.638155294116287e-06, + "loss": 0.6874, + "step": 2900 + }, + { + "epoch": 1.0708835835589465, + "grad_norm": 0.8539056181907654, + "learning_rate": 4.637903216314357e-06, + "loss": 0.6588, + "step": 2901 + }, + { + "epoch": 1.0712527688899827, + "grad_norm": 0.9034031629562378, + "learning_rate": 4.637651057593464e-06, + "loss": 0.6981, + "step": 2902 + }, + { + "epoch": 1.071621954221019, + "grad_norm": 0.8809040188789368, + "learning_rate": 4.637398817963153e-06, + "loss": 0.6455, + "step": 2903 + }, + { + "epoch": 1.0719911395520552, + "grad_norm": 0.8502632975578308, + "learning_rate": 4.6371464974329715e-06, + "loss": 0.6449, + "step": 2904 + }, + { + "epoch": 1.0723603248830913, + "grad_norm": 0.8986324071884155, + "learning_rate": 4.636894096012469e-06, + "loss": 0.6762, + "step": 2905 + }, + { + "epoch": 1.0727295102141274, + "grad_norm": 0.8782020807266235, + "learning_rate": 4.636641613711201e-06, + "loss": 0.6865, + "step": 2906 + }, + { + "epoch": 1.0730986955451636, + "grad_norm": 0.8887993097305298, + "learning_rate": 4.636389050538721e-06, + "loss": 0.6806, + "step": 2907 + }, + { + "epoch": 1.0734678808762, + "grad_norm": 0.9132815003395081, + "learning_rate": 4.63613640650459e-06, + "loss": 0.6702, + "step": 2908 + }, + { + "epoch": 1.073837066207236, + "grad_norm": 0.8909634351730347, + "learning_rate": 4.635883681618371e-06, + "loss": 0.627, + "step": 2909 + }, + { + "epoch": 1.0742062515382722, + "grad_norm": 0.89024817943573, + "learning_rate": 4.635630875889627e-06, + "loss": 0.6733, + "step": 2910 + }, + { + "epoch": 1.0745754368693083, + "grad_norm": 0.8756373524665833, + "learning_rate": 4.635377989327929e-06, + "loss": 0.6922, + "step": 2911 + }, + { + "epoch": 1.0749446222003445, + "grad_norm": 0.8728370070457458, + "learning_rate": 4.635125021942849e-06, + "loss": 0.6472, + "step": 2912 + }, + { + "epoch": 1.0753138075313808, + "grad_norm": 0.8689671158790588, + "learning_rate": 4.634871973743961e-06, + "loss": 0.6695, + "step": 2913 + }, + { + "epoch": 1.075682992862417, + "grad_norm": 0.8716274499893188, + "learning_rate": 4.634618844740841e-06, + "loss": 0.6738, + "step": 2914 + }, + { + "epoch": 1.0760521781934531, + "grad_norm": 0.8830373287200928, + "learning_rate": 4.634365634943072e-06, + "loss": 0.6761, + "step": 2915 + }, + { + "epoch": 1.0764213635244892, + "grad_norm": 0.8884536623954773, + "learning_rate": 4.634112344360237e-06, + "loss": 0.6821, + "step": 2916 + }, + { + "epoch": 1.0767905488555254, + "grad_norm": 0.8283507227897644, + "learning_rate": 4.633858973001924e-06, + "loss": 0.6951, + "step": 2917 + }, + { + "epoch": 1.0771597341865617, + "grad_norm": 0.8898729085922241, + "learning_rate": 4.633605520877722e-06, + "loss": 0.6587, + "step": 2918 + }, + { + "epoch": 1.0775289195175979, + "grad_norm": 0.8476880788803101, + "learning_rate": 4.633351987997225e-06, + "loss": 0.6462, + "step": 2919 + }, + { + "epoch": 1.077898104848634, + "grad_norm": 0.8899357914924622, + "learning_rate": 4.633098374370028e-06, + "loss": 0.6798, + "step": 2920 + }, + { + "epoch": 1.0782672901796702, + "grad_norm": 0.8883606195449829, + "learning_rate": 4.632844680005731e-06, + "loss": 0.6519, + "step": 2921 + }, + { + "epoch": 1.0786364755107063, + "grad_norm": 0.8939263820648193, + "learning_rate": 4.632590904913936e-06, + "loss": 0.6866, + "step": 2922 + }, + { + "epoch": 1.0790056608417427, + "grad_norm": 0.9059105515480042, + "learning_rate": 4.632337049104248e-06, + "loss": 0.6905, + "step": 2923 + }, + { + "epoch": 1.0793748461727788, + "grad_norm": 0.8568992018699646, + "learning_rate": 4.632083112586276e-06, + "loss": 0.6729, + "step": 2924 + }, + { + "epoch": 1.079744031503815, + "grad_norm": 0.9016958475112915, + "learning_rate": 4.63182909536963e-06, + "loss": 0.6713, + "step": 2925 + }, + { + "epoch": 1.080113216834851, + "grad_norm": 0.8861789107322693, + "learning_rate": 4.631574997463926e-06, + "loss": 0.6771, + "step": 2926 + }, + { + "epoch": 1.0804824021658872, + "grad_norm": 0.8949338793754578, + "learning_rate": 4.631320818878783e-06, + "loss": 0.7396, + "step": 2927 + }, + { + "epoch": 1.0808515874969236, + "grad_norm": 0.9233176708221436, + "learning_rate": 4.631066559623818e-06, + "loss": 0.6646, + "step": 2928 + }, + { + "epoch": 1.0812207728279597, + "grad_norm": 0.8916748762130737, + "learning_rate": 4.630812219708657e-06, + "loss": 0.653, + "step": 2929 + }, + { + "epoch": 1.0815899581589958, + "grad_norm": 0.842766523361206, + "learning_rate": 4.630557799142924e-06, + "loss": 0.6295, + "step": 2930 + }, + { + "epoch": 1.081959143490032, + "grad_norm": 0.8672784566879272, + "learning_rate": 4.6303032979362526e-06, + "loss": 0.6953, + "step": 2931 + }, + { + "epoch": 1.082328328821068, + "grad_norm": 0.8906224370002747, + "learning_rate": 4.6300487160982734e-06, + "loss": 0.6555, + "step": 2932 + }, + { + "epoch": 1.0826975141521045, + "grad_norm": 0.899634063243866, + "learning_rate": 4.629794053638622e-06, + "loss": 0.6952, + "step": 2933 + }, + { + "epoch": 1.0830666994831406, + "grad_norm": 0.9119795560836792, + "learning_rate": 4.629539310566939e-06, + "loss": 0.6798, + "step": 2934 + }, + { + "epoch": 1.0834358848141767, + "grad_norm": 0.8701754212379456, + "learning_rate": 4.629284486892865e-06, + "loss": 0.677, + "step": 2935 + }, + { + "epoch": 1.0838050701452129, + "grad_norm": 0.8475782871246338, + "learning_rate": 4.629029582626044e-06, + "loss": 0.6439, + "step": 2936 + }, + { + "epoch": 1.084174255476249, + "grad_norm": 0.9518386721611023, + "learning_rate": 4.628774597776126e-06, + "loss": 0.6517, + "step": 2937 + }, + { + "epoch": 1.0845434408072854, + "grad_norm": 0.8827958703041077, + "learning_rate": 4.628519532352761e-06, + "loss": 0.7233, + "step": 2938 + }, + { + "epoch": 1.0849126261383215, + "grad_norm": 0.8955361843109131, + "learning_rate": 4.6282643863656036e-06, + "loss": 0.6944, + "step": 2939 + }, + { + "epoch": 1.0852818114693576, + "grad_norm": 1.5501019954681396, + "learning_rate": 4.628009159824311e-06, + "loss": 0.6572, + "step": 2940 + }, + { + "epoch": 1.0856509968003938, + "grad_norm": 0.8493422269821167, + "learning_rate": 4.627753852738543e-06, + "loss": 0.6687, + "step": 2941 + }, + { + "epoch": 1.08602018213143, + "grad_norm": 0.8963118195533752, + "learning_rate": 4.627498465117963e-06, + "loss": 0.6603, + "step": 2942 + }, + { + "epoch": 1.086389367462466, + "grad_norm": 0.8641554713249207, + "learning_rate": 4.627242996972239e-06, + "loss": 0.6659, + "step": 2943 + }, + { + "epoch": 1.0867585527935024, + "grad_norm": 0.9038750529289246, + "learning_rate": 4.626987448311037e-06, + "loss": 0.651, + "step": 2944 + }, + { + "epoch": 1.0871277381245386, + "grad_norm": 0.8862461447715759, + "learning_rate": 4.6267318191440325e-06, + "loss": 0.6405, + "step": 2945 + }, + { + "epoch": 1.0874969234555747, + "grad_norm": 0.8874619603157043, + "learning_rate": 4.6264761094809e-06, + "loss": 0.6466, + "step": 2946 + }, + { + "epoch": 1.0878661087866108, + "grad_norm": 0.8822526335716248, + "learning_rate": 4.626220319331317e-06, + "loss": 0.6729, + "step": 2947 + }, + { + "epoch": 1.088235294117647, + "grad_norm": 0.904630720615387, + "learning_rate": 4.625964448704967e-06, + "loss": 0.6978, + "step": 2948 + }, + { + "epoch": 1.0886044794486833, + "grad_norm": 0.9618430137634277, + "learning_rate": 4.625708497611533e-06, + "loss": 0.6307, + "step": 2949 + }, + { + "epoch": 1.0889736647797195, + "grad_norm": 0.8589624762535095, + "learning_rate": 4.625452466060703e-06, + "loss": 0.6868, + "step": 2950 + }, + { + "epoch": 1.0893428501107556, + "grad_norm": 0.8833630084991455, + "learning_rate": 4.625196354062168e-06, + "loss": 0.6637, + "step": 2951 + }, + { + "epoch": 1.0897120354417917, + "grad_norm": 0.8676459789276123, + "learning_rate": 4.624940161625623e-06, + "loss": 0.7158, + "step": 2952 + }, + { + "epoch": 1.0900812207728279, + "grad_norm": 0.8821964859962463, + "learning_rate": 4.624683888760762e-06, + "loss": 0.6758, + "step": 2953 + }, + { + "epoch": 1.0904504061038642, + "grad_norm": 0.8526460528373718, + "learning_rate": 4.624427535477288e-06, + "loss": 0.6925, + "step": 2954 + }, + { + "epoch": 1.0908195914349004, + "grad_norm": 0.8880017995834351, + "learning_rate": 4.6241711017849e-06, + "loss": 0.6699, + "step": 2955 + }, + { + "epoch": 1.0911887767659365, + "grad_norm": 0.8728770017623901, + "learning_rate": 4.623914587693307e-06, + "loss": 0.6503, + "step": 2956 + }, + { + "epoch": 1.0915579620969726, + "grad_norm": 0.8419942855834961, + "learning_rate": 4.623657993212218e-06, + "loss": 0.6878, + "step": 2957 + }, + { + "epoch": 1.0919271474280088, + "grad_norm": 0.9012809991836548, + "learning_rate": 4.623401318351344e-06, + "loss": 0.6304, + "step": 2958 + }, + { + "epoch": 1.0922963327590451, + "grad_norm": 0.8893666863441467, + "learning_rate": 4.6231445631204e-06, + "loss": 0.6732, + "step": 2959 + }, + { + "epoch": 1.0926655180900813, + "grad_norm": 0.8986644148826599, + "learning_rate": 4.622887727529104e-06, + "loss": 0.6421, + "step": 2960 + }, + { + "epoch": 1.0930347034211174, + "grad_norm": 0.8845360279083252, + "learning_rate": 4.622630811587177e-06, + "loss": 0.6829, + "step": 2961 + }, + { + "epoch": 1.0934038887521536, + "grad_norm": 0.8940880298614502, + "learning_rate": 4.622373815304345e-06, + "loss": 0.6601, + "step": 2962 + }, + { + "epoch": 1.0937730740831897, + "grad_norm": 0.886100172996521, + "learning_rate": 4.622116738690334e-06, + "loss": 0.6483, + "step": 2963 + }, + { + "epoch": 1.0941422594142258, + "grad_norm": 0.8825613260269165, + "learning_rate": 4.621859581754874e-06, + "loss": 0.6128, + "step": 2964 + }, + { + "epoch": 1.0945114447452622, + "grad_norm": 0.8987419009208679, + "learning_rate": 4.621602344507697e-06, + "loss": 0.6691, + "step": 2965 + }, + { + "epoch": 1.0948806300762983, + "grad_norm": 0.8863619565963745, + "learning_rate": 4.621345026958542e-06, + "loss": 0.6708, + "step": 2966 + }, + { + "epoch": 1.0952498154073345, + "grad_norm": 0.8751677870750427, + "learning_rate": 4.621087629117148e-06, + "loss": 0.655, + "step": 2967 + }, + { + "epoch": 1.0956190007383706, + "grad_norm": 0.8767642378807068, + "learning_rate": 4.620830150993255e-06, + "loss": 0.6719, + "step": 2968 + }, + { + "epoch": 1.0959881860694067, + "grad_norm": 0.8525567054748535, + "learning_rate": 4.620572592596612e-06, + "loss": 0.6595, + "step": 2969 + }, + { + "epoch": 1.096357371400443, + "grad_norm": 0.8528827428817749, + "learning_rate": 4.620314953936964e-06, + "loss": 0.6557, + "step": 2970 + }, + { + "epoch": 1.0967265567314792, + "grad_norm": 0.85749751329422, + "learning_rate": 4.620057235024065e-06, + "loss": 0.626, + "step": 2971 + }, + { + "epoch": 1.0970957420625154, + "grad_norm": 0.8441216945648193, + "learning_rate": 4.619799435867668e-06, + "loss": 0.676, + "step": 2972 + }, + { + "epoch": 1.0974649273935515, + "grad_norm": 0.8669651746749878, + "learning_rate": 4.6195415564775326e-06, + "loss": 0.6904, + "step": 2973 + }, + { + "epoch": 1.0978341127245876, + "grad_norm": 0.9166847467422485, + "learning_rate": 4.619283596863417e-06, + "loss": 0.7244, + "step": 2974 + }, + { + "epoch": 1.098203298055624, + "grad_norm": 0.9193668961524963, + "learning_rate": 4.619025557035087e-06, + "loss": 0.6776, + "step": 2975 + }, + { + "epoch": 1.0985724833866601, + "grad_norm": 0.8603821992874146, + "learning_rate": 4.618767437002308e-06, + "loss": 0.6893, + "step": 2976 + }, + { + "epoch": 1.0989416687176963, + "grad_norm": 0.9150217175483704, + "learning_rate": 4.6185092367748505e-06, + "loss": 0.704, + "step": 2977 + }, + { + "epoch": 1.0993108540487324, + "grad_norm": 0.9158179759979248, + "learning_rate": 4.618250956362487e-06, + "loss": 0.6718, + "step": 2978 + }, + { + "epoch": 1.0996800393797685, + "grad_norm": 0.8916653394699097, + "learning_rate": 4.617992595774993e-06, + "loss": 0.657, + "step": 2979 + }, + { + "epoch": 1.100049224710805, + "grad_norm": 0.8508052229881287, + "learning_rate": 4.6177341550221476e-06, + "loss": 0.6906, + "step": 2980 + }, + { + "epoch": 1.100418410041841, + "grad_norm": 0.8895897269248962, + "learning_rate": 4.617475634113733e-06, + "loss": 0.6637, + "step": 2981 + }, + { + "epoch": 1.1007875953728772, + "grad_norm": 0.8731436729431152, + "learning_rate": 4.6172170330595344e-06, + "loss": 0.6861, + "step": 2982 + }, + { + "epoch": 1.1011567807039133, + "grad_norm": 0.907616913318634, + "learning_rate": 4.616958351869339e-06, + "loss": 0.6776, + "step": 2983 + }, + { + "epoch": 1.1015259660349495, + "grad_norm": 0.8344727158546448, + "learning_rate": 4.616699590552938e-06, + "loss": 0.6667, + "step": 2984 + }, + { + "epoch": 1.1018951513659858, + "grad_norm": 0.8614858984947205, + "learning_rate": 4.616440749120126e-06, + "loss": 0.6296, + "step": 2985 + }, + { + "epoch": 1.102264336697022, + "grad_norm": 0.8393322229385376, + "learning_rate": 4.616181827580699e-06, + "loss": 0.6892, + "step": 2986 + }, + { + "epoch": 1.102633522028058, + "grad_norm": 0.9356122016906738, + "learning_rate": 4.615922825944458e-06, + "loss": 0.6507, + "step": 2987 + }, + { + "epoch": 1.1030027073590942, + "grad_norm": 0.859305739402771, + "learning_rate": 4.615663744221205e-06, + "loss": 0.6226, + "step": 2988 + }, + { + "epoch": 1.1033718926901304, + "grad_norm": 0.8649137616157532, + "learning_rate": 4.615404582420748e-06, + "loss": 0.6759, + "step": 2989 + }, + { + "epoch": 1.1037410780211667, + "grad_norm": 0.8873231410980225, + "learning_rate": 4.615145340552895e-06, + "loss": 0.6609, + "step": 2990 + }, + { + "epoch": 1.1041102633522029, + "grad_norm": 0.859127938747406, + "learning_rate": 4.614886018627458e-06, + "loss": 0.6776, + "step": 2991 + }, + { + "epoch": 1.104479448683239, + "grad_norm": 0.8877171874046326, + "learning_rate": 4.614626616654253e-06, + "loss": 0.7006, + "step": 2992 + }, + { + "epoch": 1.1048486340142751, + "grad_norm": 0.8923121690750122, + "learning_rate": 4.614367134643098e-06, + "loss": 0.6884, + "step": 2993 + }, + { + "epoch": 1.1052178193453113, + "grad_norm": 0.9147962331771851, + "learning_rate": 4.614107572603815e-06, + "loss": 0.6534, + "step": 2994 + }, + { + "epoch": 1.1055870046763476, + "grad_norm": 0.8767805695533752, + "learning_rate": 4.613847930546226e-06, + "loss": 0.6818, + "step": 2995 + }, + { + "epoch": 1.1059561900073838, + "grad_norm": 0.8978078365325928, + "learning_rate": 4.613588208480161e-06, + "loss": 0.6591, + "step": 2996 + }, + { + "epoch": 1.10632537533842, + "grad_norm": 0.8833415508270264, + "learning_rate": 4.613328406415448e-06, + "loss": 0.685, + "step": 2997 + }, + { + "epoch": 1.106694560669456, + "grad_norm": 0.9047439694404602, + "learning_rate": 4.613068524361923e-06, + "loss": 0.691, + "step": 2998 + }, + { + "epoch": 1.1070637460004922, + "grad_norm": 0.8956855535507202, + "learning_rate": 4.612808562329422e-06, + "loss": 0.682, + "step": 2999 + }, + { + "epoch": 1.1074329313315285, + "grad_norm": 0.8967129588127136, + "learning_rate": 4.612548520327782e-06, + "loss": 0.6664, + "step": 3000 + }, + { + "epoch": 1.1078021166625647, + "grad_norm": 0.8797708749771118, + "learning_rate": 4.612288398366848e-06, + "loss": 0.6491, + "step": 3001 + }, + { + "epoch": 1.1081713019936008, + "grad_norm": 0.8553269505500793, + "learning_rate": 4.612028196456465e-06, + "loss": 0.7016, + "step": 3002 + }, + { + "epoch": 1.108540487324637, + "grad_norm": 0.928281843662262, + "learning_rate": 4.611767914606481e-06, + "loss": 0.6842, + "step": 3003 + }, + { + "epoch": 1.108909672655673, + "grad_norm": 0.928591787815094, + "learning_rate": 4.611507552826748e-06, + "loss": 0.6608, + "step": 3004 + }, + { + "epoch": 1.1092788579867094, + "grad_norm": 0.8698152899742126, + "learning_rate": 4.6112471111271205e-06, + "loss": 0.6271, + "step": 3005 + }, + { + "epoch": 1.1096480433177456, + "grad_norm": 0.909089982509613, + "learning_rate": 4.610986589517457e-06, + "loss": 0.6666, + "step": 3006 + }, + { + "epoch": 1.1100172286487817, + "grad_norm": 0.8821618556976318, + "learning_rate": 4.610725988007616e-06, + "loss": 0.6519, + "step": 3007 + }, + { + "epoch": 1.1103864139798179, + "grad_norm": 0.8714765906333923, + "learning_rate": 4.6104653066074635e-06, + "loss": 0.6668, + "step": 3008 + }, + { + "epoch": 1.110755599310854, + "grad_norm": 0.8812129497528076, + "learning_rate": 4.610204545326865e-06, + "loss": 0.6763, + "step": 3009 + }, + { + "epoch": 1.1111247846418901, + "grad_norm": 0.8672915697097778, + "learning_rate": 4.60994370417569e-06, + "loss": 0.656, + "step": 3010 + }, + { + "epoch": 1.1114939699729265, + "grad_norm": 0.8619241714477539, + "learning_rate": 4.609682783163813e-06, + "loss": 0.6631, + "step": 3011 + }, + { + "epoch": 1.1118631553039626, + "grad_norm": 0.8665770888328552, + "learning_rate": 4.6094217823011066e-06, + "loss": 0.7068, + "step": 3012 + }, + { + "epoch": 1.1122323406349988, + "grad_norm": 0.8892896175384521, + "learning_rate": 4.6091607015974525e-06, + "loss": 0.6232, + "step": 3013 + }, + { + "epoch": 1.112601525966035, + "grad_norm": 0.888497531414032, + "learning_rate": 4.608899541062731e-06, + "loss": 0.6746, + "step": 3014 + }, + { + "epoch": 1.112970711297071, + "grad_norm": 0.8950848579406738, + "learning_rate": 4.608638300706828e-06, + "loss": 0.6485, + "step": 3015 + }, + { + "epoch": 1.1133398966281074, + "grad_norm": 0.8799687027931213, + "learning_rate": 4.608376980539631e-06, + "loss": 0.6755, + "step": 3016 + }, + { + "epoch": 1.1137090819591435, + "grad_norm": 0.8951284289360046, + "learning_rate": 4.60811558057103e-06, + "loss": 0.7016, + "step": 3017 + }, + { + "epoch": 1.1140782672901797, + "grad_norm": 0.874884843826294, + "learning_rate": 4.60785410081092e-06, + "loss": 0.6493, + "step": 3018 + }, + { + "epoch": 1.1144474526212158, + "grad_norm": 0.877614438533783, + "learning_rate": 4.607592541269198e-06, + "loss": 0.6818, + "step": 3019 + }, + { + "epoch": 1.114816637952252, + "grad_norm": 0.8698077201843262, + "learning_rate": 4.6073309019557624e-06, + "loss": 0.6693, + "step": 3020 + }, + { + "epoch": 1.1151858232832883, + "grad_norm": 0.8859339952468872, + "learning_rate": 4.607069182880518e-06, + "loss": 0.6645, + "step": 3021 + }, + { + "epoch": 1.1155550086143244, + "grad_norm": 0.9041199684143066, + "learning_rate": 4.606807384053369e-06, + "loss": 0.683, + "step": 3022 + }, + { + "epoch": 1.1159241939453606, + "grad_norm": 0.8573592305183411, + "learning_rate": 4.606545505484227e-06, + "loss": 0.6711, + "step": 3023 + }, + { + "epoch": 1.1162933792763967, + "grad_norm": 0.9114444851875305, + "learning_rate": 4.6062835471830014e-06, + "loss": 0.6613, + "step": 3024 + }, + { + "epoch": 1.1166625646074329, + "grad_norm": 0.8586018085479736, + "learning_rate": 4.6060215091596086e-06, + "loss": 0.6718, + "step": 3025 + }, + { + "epoch": 1.1170317499384692, + "grad_norm": 0.857578694820404, + "learning_rate": 4.605759391423966e-06, + "loss": 0.6404, + "step": 3026 + }, + { + "epoch": 1.1174009352695053, + "grad_norm": 0.8791587352752686, + "learning_rate": 4.605497193985996e-06, + "loss": 0.6504, + "step": 3027 + }, + { + "epoch": 1.1177701206005415, + "grad_norm": 0.8619233965873718, + "learning_rate": 4.605234916855621e-06, + "loss": 0.6876, + "step": 3028 + }, + { + "epoch": 1.1181393059315776, + "grad_norm": 0.8764036297798157, + "learning_rate": 4.604972560042769e-06, + "loss": 0.6876, + "step": 3029 + }, + { + "epoch": 1.1185084912626138, + "grad_norm": 0.8742419481277466, + "learning_rate": 4.60471012355737e-06, + "loss": 0.6854, + "step": 3030 + }, + { + "epoch": 1.11887767659365, + "grad_norm": 0.8788050413131714, + "learning_rate": 4.6044476074093565e-06, + "loss": 0.679, + "step": 3031 + }, + { + "epoch": 1.1192468619246863, + "grad_norm": 0.9018889665603638, + "learning_rate": 4.604185011608666e-06, + "loss": 0.6584, + "step": 3032 + }, + { + "epoch": 1.1196160472557224, + "grad_norm": 0.8427954912185669, + "learning_rate": 4.6039223361652364e-06, + "loss": 0.6675, + "step": 3033 + }, + { + "epoch": 1.1199852325867585, + "grad_norm": 0.8586843609809875, + "learning_rate": 4.603659581089011e-06, + "loss": 0.635, + "step": 3034 + }, + { + "epoch": 1.1203544179177947, + "grad_norm": 0.8667201399803162, + "learning_rate": 4.603396746389933e-06, + "loss": 0.6661, + "step": 3035 + }, + { + "epoch": 1.1207236032488308, + "grad_norm": 0.9080908894538879, + "learning_rate": 4.603133832077953e-06, + "loss": 0.6476, + "step": 3036 + }, + { + "epoch": 1.1210927885798672, + "grad_norm": 0.8796735405921936, + "learning_rate": 4.6028708381630215e-06, + "loss": 0.6716, + "step": 3037 + }, + { + "epoch": 1.1214619739109033, + "grad_norm": 0.886135458946228, + "learning_rate": 4.602607764655092e-06, + "loss": 0.6902, + "step": 3038 + }, + { + "epoch": 1.1218311592419394, + "grad_norm": 0.8607080578804016, + "learning_rate": 4.602344611564122e-06, + "loss": 0.6566, + "step": 3039 + }, + { + "epoch": 1.1222003445729756, + "grad_norm": 0.8910879492759705, + "learning_rate": 4.602081378900073e-06, + "loss": 0.6305, + "step": 3040 + }, + { + "epoch": 1.1225695299040117, + "grad_norm": 0.8414924144744873, + "learning_rate": 4.6018180666729065e-06, + "loss": 0.6983, + "step": 3041 + }, + { + "epoch": 1.122938715235048, + "grad_norm": 0.8668161034584045, + "learning_rate": 4.601554674892589e-06, + "loss": 0.6426, + "step": 3042 + }, + { + "epoch": 1.1233079005660842, + "grad_norm": 0.8414568901062012, + "learning_rate": 4.601291203569091e-06, + "loss": 0.6561, + "step": 3043 + }, + { + "epoch": 1.1236770858971203, + "grad_norm": 0.9052603840827942, + "learning_rate": 4.601027652712384e-06, + "loss": 0.6688, + "step": 3044 + }, + { + "epoch": 1.1240462712281565, + "grad_norm": 0.8866949677467346, + "learning_rate": 4.600764022332444e-06, + "loss": 0.669, + "step": 3045 + }, + { + "epoch": 1.1244154565591926, + "grad_norm": 0.8745715022087097, + "learning_rate": 4.600500312439247e-06, + "loss": 0.686, + "step": 3046 + }, + { + "epoch": 1.124784641890229, + "grad_norm": 0.912263035774231, + "learning_rate": 4.600236523042778e-06, + "loss": 0.6878, + "step": 3047 + }, + { + "epoch": 1.1251538272212651, + "grad_norm": 0.8800041079521179, + "learning_rate": 4.599972654153018e-06, + "loss": 0.7205, + "step": 3048 + }, + { + "epoch": 1.1255230125523012, + "grad_norm": 0.8946696519851685, + "learning_rate": 4.599708705779957e-06, + "loss": 0.6585, + "step": 3049 + }, + { + "epoch": 1.1258921978833374, + "grad_norm": 0.8898885250091553, + "learning_rate": 4.599444677933583e-06, + "loss": 0.6685, + "step": 3050 + }, + { + "epoch": 1.1262613832143735, + "grad_norm": 0.8790240287780762, + "learning_rate": 4.599180570623892e-06, + "loss": 0.6461, + "step": 3051 + }, + { + "epoch": 1.1266305685454099, + "grad_norm": 0.9005390405654907, + "learning_rate": 4.598916383860878e-06, + "loss": 0.6618, + "step": 3052 + }, + { + "epoch": 1.126999753876446, + "grad_norm": 0.8695049285888672, + "learning_rate": 4.598652117654541e-06, + "loss": 0.6647, + "step": 3053 + }, + { + "epoch": 1.1273689392074822, + "grad_norm": 0.8627796173095703, + "learning_rate": 4.598387772014884e-06, + "loss": 0.6875, + "step": 3054 + }, + { + "epoch": 1.1277381245385183, + "grad_norm": 0.8951735496520996, + "learning_rate": 4.598123346951912e-06, + "loss": 0.6968, + "step": 3055 + }, + { + "epoch": 1.1281073098695544, + "grad_norm": 0.9162535071372986, + "learning_rate": 4.5978588424756345e-06, + "loss": 0.638, + "step": 3056 + }, + { + "epoch": 1.1284764952005908, + "grad_norm": 0.8572717905044556, + "learning_rate": 4.597594258596061e-06, + "loss": 0.6978, + "step": 3057 + }, + { + "epoch": 1.128845680531627, + "grad_norm": 0.8789711594581604, + "learning_rate": 4.5973295953232065e-06, + "loss": 0.6465, + "step": 3058 + }, + { + "epoch": 1.129214865862663, + "grad_norm": 0.8686862587928772, + "learning_rate": 4.597064852667089e-06, + "loss": 0.6551, + "step": 3059 + }, + { + "epoch": 1.1295840511936992, + "grad_norm": 0.9054696559906006, + "learning_rate": 4.5968000306377295e-06, + "loss": 0.6809, + "step": 3060 + }, + { + "epoch": 1.1299532365247353, + "grad_norm": 0.8781320452690125, + "learning_rate": 4.59653512924515e-06, + "loss": 0.6516, + "step": 3061 + }, + { + "epoch": 1.1303224218557717, + "grad_norm": 0.8737232685089111, + "learning_rate": 4.596270148499377e-06, + "loss": 0.6823, + "step": 3062 + }, + { + "epoch": 1.1306916071868078, + "grad_norm": 0.8819912672042847, + "learning_rate": 4.596005088410441e-06, + "loss": 0.6948, + "step": 3063 + }, + { + "epoch": 1.131060792517844, + "grad_norm": 0.8776569366455078, + "learning_rate": 4.5957399489883735e-06, + "loss": 0.6428, + "step": 3064 + }, + { + "epoch": 1.13142997784888, + "grad_norm": 0.842181384563446, + "learning_rate": 4.595474730243211e-06, + "loss": 0.6447, + "step": 3065 + }, + { + "epoch": 1.1317991631799162, + "grad_norm": 0.8719273209571838, + "learning_rate": 4.595209432184991e-06, + "loss": 0.6467, + "step": 3066 + }, + { + "epoch": 1.1321683485109526, + "grad_norm": 0.8822826147079468, + "learning_rate": 4.594944054823755e-06, + "loss": 0.6994, + "step": 3067 + }, + { + "epoch": 1.1325375338419887, + "grad_norm": 0.8981608152389526, + "learning_rate": 4.594678598169549e-06, + "loss": 0.6737, + "step": 3068 + }, + { + "epoch": 1.1329067191730249, + "grad_norm": 0.9050329327583313, + "learning_rate": 4.594413062232417e-06, + "loss": 0.6802, + "step": 3069 + }, + { + "epoch": 1.133275904504061, + "grad_norm": 0.8693145513534546, + "learning_rate": 4.594147447022414e-06, + "loss": 0.6811, + "step": 3070 + }, + { + "epoch": 1.1336450898350972, + "grad_norm": 0.8644801378250122, + "learning_rate": 4.593881752549589e-06, + "loss": 0.6507, + "step": 3071 + }, + { + "epoch": 1.1340142751661335, + "grad_norm": 0.8563529253005981, + "learning_rate": 4.593615978824001e-06, + "loss": 0.6862, + "step": 3072 + }, + { + "epoch": 1.1343834604971696, + "grad_norm": 0.9366843700408936, + "learning_rate": 4.593350125855709e-06, + "loss": 0.6677, + "step": 3073 + }, + { + "epoch": 1.1347526458282058, + "grad_norm": 0.8364158272743225, + "learning_rate": 4.593084193654776e-06, + "loss": 0.6644, + "step": 3074 + }, + { + "epoch": 1.135121831159242, + "grad_norm": 0.9042230248451233, + "learning_rate": 4.592818182231267e-06, + "loss": 0.6344, + "step": 3075 + }, + { + "epoch": 1.135491016490278, + "grad_norm": 0.8691655993461609, + "learning_rate": 4.59255209159525e-06, + "loss": 0.6479, + "step": 3076 + }, + { + "epoch": 1.1358602018213144, + "grad_norm": 0.8839971423149109, + "learning_rate": 4.592285921756796e-06, + "loss": 0.6709, + "step": 3077 + }, + { + "epoch": 1.1362293871523506, + "grad_norm": 0.8699178695678711, + "learning_rate": 4.5920196727259805e-06, + "loss": 0.6581, + "step": 3078 + }, + { + "epoch": 1.1365985724833867, + "grad_norm": 0.8641122579574585, + "learning_rate": 4.591753344512881e-06, + "loss": 0.6279, + "step": 3079 + }, + { + "epoch": 1.1369677578144228, + "grad_norm": 0.856166660785675, + "learning_rate": 4.591486937127577e-06, + "loss": 0.7188, + "step": 3080 + }, + { + "epoch": 1.137336943145459, + "grad_norm": 0.8756340146064758, + "learning_rate": 4.591220450580154e-06, + "loss": 0.674, + "step": 3081 + }, + { + "epoch": 1.137706128476495, + "grad_norm": 0.8835283517837524, + "learning_rate": 4.5909538848806955e-06, + "loss": 0.6826, + "step": 3082 + }, + { + "epoch": 1.1380753138075315, + "grad_norm": 0.882703959941864, + "learning_rate": 4.590687240039292e-06, + "loss": 0.6862, + "step": 3083 + }, + { + "epoch": 1.1384444991385676, + "grad_norm": 0.8612953424453735, + "learning_rate": 4.590420516066038e-06, + "loss": 0.68, + "step": 3084 + }, + { + "epoch": 1.1388136844696037, + "grad_norm": 1.3534997701644897, + "learning_rate": 4.590153712971026e-06, + "loss": 0.6525, + "step": 3085 + }, + { + "epoch": 1.1391828698006399, + "grad_norm": 0.8815191388130188, + "learning_rate": 4.5898868307643555e-06, + "loss": 0.6905, + "step": 3086 + }, + { + "epoch": 1.139552055131676, + "grad_norm": 0.9045290946960449, + "learning_rate": 4.589619869456129e-06, + "loss": 0.6448, + "step": 3087 + }, + { + "epoch": 1.1399212404627124, + "grad_norm": 0.8359318971633911, + "learning_rate": 4.589352829056449e-06, + "loss": 0.6707, + "step": 3088 + }, + { + "epoch": 1.1402904257937485, + "grad_norm": 0.8884540796279907, + "learning_rate": 4.589085709575425e-06, + "loss": 0.6685, + "step": 3089 + }, + { + "epoch": 1.1406596111247846, + "grad_norm": 0.8935374617576599, + "learning_rate": 4.588818511023165e-06, + "loss": 0.7126, + "step": 3090 + }, + { + "epoch": 1.1410287964558208, + "grad_norm": 0.9402269721031189, + "learning_rate": 4.5885512334097835e-06, + "loss": 0.6771, + "step": 3091 + }, + { + "epoch": 1.141397981786857, + "grad_norm": 0.8802783489227295, + "learning_rate": 4.588283876745398e-06, + "loss": 0.7017, + "step": 3092 + }, + { + "epoch": 1.1417671671178933, + "grad_norm": 0.873133659362793, + "learning_rate": 4.588016441040126e-06, + "loss": 0.6598, + "step": 3093 + }, + { + "epoch": 1.1421363524489294, + "grad_norm": 0.9024394750595093, + "learning_rate": 4.5877489263040916e-06, + "loss": 0.6928, + "step": 3094 + }, + { + "epoch": 1.1425055377799656, + "grad_norm": 0.875346302986145, + "learning_rate": 4.587481332547418e-06, + "loss": 0.6608, + "step": 3095 + }, + { + "epoch": 1.1428747231110017, + "grad_norm": 0.8695294260978699, + "learning_rate": 4.5872136597802355e-06, + "loss": 0.6082, + "step": 3096 + }, + { + "epoch": 1.1432439084420378, + "grad_norm": 0.9199824333190918, + "learning_rate": 4.586945908012675e-06, + "loss": 0.7107, + "step": 3097 + }, + { + "epoch": 1.143613093773074, + "grad_norm": 0.9020886421203613, + "learning_rate": 4.58667807725487e-06, + "loss": 0.6799, + "step": 3098 + }, + { + "epoch": 1.1439822791041103, + "grad_norm": 0.8492481708526611, + "learning_rate": 4.5864101675169594e-06, + "loss": 0.6576, + "step": 3099 + }, + { + "epoch": 1.1443514644351465, + "grad_norm": 0.8472234010696411, + "learning_rate": 4.586142178809081e-06, + "loss": 0.6386, + "step": 3100 + }, + { + "epoch": 1.1447206497661826, + "grad_norm": 0.8404666185379028, + "learning_rate": 4.58587411114138e-06, + "loss": 0.6417, + "step": 3101 + }, + { + "epoch": 1.1450898350972187, + "grad_norm": 0.8665240406990051, + "learning_rate": 4.585605964524002e-06, + "loss": 0.678, + "step": 3102 + }, + { + "epoch": 1.1454590204282549, + "grad_norm": 0.878446102142334, + "learning_rate": 4.585337738967098e-06, + "loss": 0.6515, + "step": 3103 + }, + { + "epoch": 1.1458282057592912, + "grad_norm": 0.871772050857544, + "learning_rate": 4.585069434480817e-06, + "loss": 0.6938, + "step": 3104 + }, + { + "epoch": 1.1461973910903274, + "grad_norm": 0.8962779641151428, + "learning_rate": 4.584801051075316e-06, + "loss": 0.668, + "step": 3105 + }, + { + "epoch": 1.1465665764213635, + "grad_norm": 0.8464193344116211, + "learning_rate": 4.584532588760754e-06, + "loss": 0.6532, + "step": 3106 + }, + { + "epoch": 1.1469357617523996, + "grad_norm": 0.8625430464744568, + "learning_rate": 4.584264047547291e-06, + "loss": 0.6698, + "step": 3107 + }, + { + "epoch": 1.1473049470834358, + "grad_norm": 0.9394546747207642, + "learning_rate": 4.583995427445091e-06, + "loss": 0.6774, + "step": 3108 + }, + { + "epoch": 1.1476741324144721, + "grad_norm": 0.8755576610565186, + "learning_rate": 4.5837267284643225e-06, + "loss": 0.6476, + "step": 3109 + }, + { + "epoch": 1.1480433177455083, + "grad_norm": 0.912662148475647, + "learning_rate": 4.5834579506151555e-06, + "loss": 0.6866, + "step": 3110 + }, + { + "epoch": 1.1484125030765444, + "grad_norm": 0.8684141039848328, + "learning_rate": 4.583189093907762e-06, + "loss": 0.671, + "step": 3111 + }, + { + "epoch": 1.1487816884075805, + "grad_norm": 0.8982227444648743, + "learning_rate": 4.5829201583523185e-06, + "loss": 0.6547, + "step": 3112 + }, + { + "epoch": 1.1491508737386167, + "grad_norm": 0.9017815589904785, + "learning_rate": 4.582651143959005e-06, + "loss": 0.679, + "step": 3113 + }, + { + "epoch": 1.149520059069653, + "grad_norm": 0.8895850777626038, + "learning_rate": 4.582382050738003e-06, + "loss": 0.6741, + "step": 3114 + }, + { + "epoch": 1.1498892444006892, + "grad_norm": 0.8657492995262146, + "learning_rate": 4.582112878699498e-06, + "loss": 0.6634, + "step": 3115 + }, + { + "epoch": 1.1502584297317253, + "grad_norm": 0.872909665107727, + "learning_rate": 4.581843627853676e-06, + "loss": 0.6927, + "step": 3116 + }, + { + "epoch": 1.1506276150627615, + "grad_norm": 0.9140523076057434, + "learning_rate": 4.581574298210731e-06, + "loss": 0.718, + "step": 3117 + }, + { + "epoch": 1.1509968003937976, + "grad_norm": 0.8790487051010132, + "learning_rate": 4.581304889780855e-06, + "loss": 0.6775, + "step": 3118 + }, + { + "epoch": 1.151365985724834, + "grad_norm": 0.8707736134529114, + "learning_rate": 4.581035402574246e-06, + "loss": 0.6694, + "step": 3119 + }, + { + "epoch": 1.15173517105587, + "grad_norm": 0.8553259372711182, + "learning_rate": 4.580765836601104e-06, + "loss": 0.6728, + "step": 3120 + }, + { + "epoch": 1.1521043563869062, + "grad_norm": 0.8715519309043884, + "learning_rate": 4.580496191871632e-06, + "loss": 0.6886, + "step": 3121 + }, + { + "epoch": 1.1524735417179424, + "grad_norm": 0.8560150861740112, + "learning_rate": 4.580226468396036e-06, + "loss": 0.6975, + "step": 3122 + }, + { + "epoch": 1.1528427270489785, + "grad_norm": 0.8739336133003235, + "learning_rate": 4.579956666184524e-06, + "loss": 0.6485, + "step": 3123 + }, + { + "epoch": 1.1532119123800149, + "grad_norm": 0.8892291784286499, + "learning_rate": 4.579686785247309e-06, + "loss": 0.6666, + "step": 3124 + }, + { + "epoch": 1.153581097711051, + "grad_norm": 0.8795207142829895, + "learning_rate": 4.579416825594606e-06, + "loss": 0.6875, + "step": 3125 + }, + { + "epoch": 1.1539502830420871, + "grad_norm": 0.8551396727561951, + "learning_rate": 4.579146787236632e-06, + "loss": 0.6582, + "step": 3126 + }, + { + "epoch": 1.1543194683731233, + "grad_norm": 0.88111811876297, + "learning_rate": 4.578876670183608e-06, + "loss": 0.6788, + "step": 3127 + }, + { + "epoch": 1.1546886537041594, + "grad_norm": 0.9169338941574097, + "learning_rate": 4.578606474445758e-06, + "loss": 0.6563, + "step": 3128 + }, + { + "epoch": 1.1550578390351958, + "grad_norm": 0.8605108261108398, + "learning_rate": 4.578336200033309e-06, + "loss": 0.6648, + "step": 3129 + }, + { + "epoch": 1.155427024366232, + "grad_norm": 0.8755785822868347, + "learning_rate": 4.578065846956491e-06, + "loss": 0.6564, + "step": 3130 + }, + { + "epoch": 1.155796209697268, + "grad_norm": 0.8738769888877869, + "learning_rate": 4.5777954152255365e-06, + "loss": 0.6546, + "step": 3131 + }, + { + "epoch": 1.1561653950283042, + "grad_norm": 0.8386809825897217, + "learning_rate": 4.5775249048506815e-06, + "loss": 0.6228, + "step": 3132 + }, + { + "epoch": 1.1565345803593403, + "grad_norm": 0.8720632195472717, + "learning_rate": 4.577254315842165e-06, + "loss": 0.6683, + "step": 3133 + }, + { + "epoch": 1.1569037656903767, + "grad_norm": 0.8456600904464722, + "learning_rate": 4.576983648210227e-06, + "loss": 0.6967, + "step": 3134 + }, + { + "epoch": 1.1572729510214128, + "grad_norm": 0.8863921165466309, + "learning_rate": 4.576712901965114e-06, + "loss": 0.679, + "step": 3135 + }, + { + "epoch": 1.157642136352449, + "grad_norm": 0.8536416888237, + "learning_rate": 4.5764420771170735e-06, + "loss": 0.6769, + "step": 3136 + }, + { + "epoch": 1.158011321683485, + "grad_norm": 0.8529084324836731, + "learning_rate": 4.576171173676355e-06, + "loss": 0.6196, + "step": 3137 + }, + { + "epoch": 1.1583805070145212, + "grad_norm": 0.8536810874938965, + "learning_rate": 4.575900191653213e-06, + "loss": 0.6678, + "step": 3138 + }, + { + "epoch": 1.1587496923455576, + "grad_norm": 0.8682736158370972, + "learning_rate": 4.575629131057905e-06, + "loss": 0.6972, + "step": 3139 + }, + { + "epoch": 1.1591188776765937, + "grad_norm": 0.8907808065414429, + "learning_rate": 4.5753579919006874e-06, + "loss": 0.6452, + "step": 3140 + }, + { + "epoch": 1.1594880630076299, + "grad_norm": 0.8453400731086731, + "learning_rate": 4.575086774191826e-06, + "loss": 0.6619, + "step": 3141 + }, + { + "epoch": 1.159857248338666, + "grad_norm": 0.8657371997833252, + "learning_rate": 4.5748154779415855e-06, + "loss": 0.6383, + "step": 3142 + }, + { + "epoch": 1.1602264336697021, + "grad_norm": 0.8822624087333679, + "learning_rate": 4.574544103160234e-06, + "loss": 0.6691, + "step": 3143 + }, + { + "epoch": 1.1605956190007385, + "grad_norm": 0.8672064542770386, + "learning_rate": 4.574272649858042e-06, + "loss": 0.632, + "step": 3144 + }, + { + "epoch": 1.1609648043317746, + "grad_norm": 0.8377134203910828, + "learning_rate": 4.574001118045285e-06, + "loss": 0.6622, + "step": 3145 + }, + { + "epoch": 1.1613339896628108, + "grad_norm": 0.8876447677612305, + "learning_rate": 4.573729507732241e-06, + "loss": 0.6585, + "step": 3146 + }, + { + "epoch": 1.161703174993847, + "grad_norm": 0.8906093239784241, + "learning_rate": 4.573457818929189e-06, + "loss": 0.6454, + "step": 3147 + }, + { + "epoch": 1.162072360324883, + "grad_norm": 0.8651725053787231, + "learning_rate": 4.573186051646413e-06, + "loss": 0.671, + "step": 3148 + }, + { + "epoch": 1.1624415456559194, + "grad_norm": 0.9006112217903137, + "learning_rate": 4.5729142058942e-06, + "loss": 0.6661, + "step": 3149 + }, + { + "epoch": 1.1628107309869555, + "grad_norm": 0.8941988348960876, + "learning_rate": 4.572642281682838e-06, + "loss": 0.6653, + "step": 3150 + }, + { + "epoch": 1.1631799163179917, + "grad_norm": 0.8998602032661438, + "learning_rate": 4.572370279022619e-06, + "loss": 0.6998, + "step": 3151 + }, + { + "epoch": 1.1635491016490278, + "grad_norm": 0.8866726756095886, + "learning_rate": 4.572098197923839e-06, + "loss": 0.6775, + "step": 3152 + }, + { + "epoch": 1.163918286980064, + "grad_norm": 0.8753821849822998, + "learning_rate": 4.5718260383967975e-06, + "loss": 0.6756, + "step": 3153 + }, + { + "epoch": 1.1642874723111, + "grad_norm": 0.8491327166557312, + "learning_rate": 4.5715538004517935e-06, + "loss": 0.6589, + "step": 3154 + }, + { + "epoch": 1.1646566576421364, + "grad_norm": 0.8599126935005188, + "learning_rate": 4.5712814840991324e-06, + "loss": 0.663, + "step": 3155 + }, + { + "epoch": 1.1650258429731726, + "grad_norm": 0.848477840423584, + "learning_rate": 4.57100908934912e-06, + "loss": 0.6772, + "step": 3156 + }, + { + "epoch": 1.1653950283042087, + "grad_norm": 0.8838089108467102, + "learning_rate": 4.570736616212068e-06, + "loss": 0.6993, + "step": 3157 + }, + { + "epoch": 1.1657642136352449, + "grad_norm": 0.8831890225410461, + "learning_rate": 4.570464064698289e-06, + "loss": 0.6522, + "step": 3158 + }, + { + "epoch": 1.166133398966281, + "grad_norm": 0.8792640566825867, + "learning_rate": 4.570191434818097e-06, + "loss": 0.6425, + "step": 3159 + }, + { + "epoch": 1.1665025842973173, + "grad_norm": 0.8846005797386169, + "learning_rate": 4.569918726581814e-06, + "loss": 0.6874, + "step": 3160 + }, + { + "epoch": 1.1668717696283535, + "grad_norm": 0.8826996684074402, + "learning_rate": 4.569645939999759e-06, + "loss": 0.6596, + "step": 3161 + }, + { + "epoch": 1.1672409549593896, + "grad_norm": 0.9217683672904968, + "learning_rate": 4.569373075082259e-06, + "loss": 0.6647, + "step": 3162 + }, + { + "epoch": 1.1676101402904258, + "grad_norm": 0.8851909041404724, + "learning_rate": 4.569100131839642e-06, + "loss": 0.6556, + "step": 3163 + }, + { + "epoch": 1.167979325621462, + "grad_norm": 0.8815199136734009, + "learning_rate": 4.568827110282238e-06, + "loss": 0.6661, + "step": 3164 + }, + { + "epoch": 1.168348510952498, + "grad_norm": 0.8823119401931763, + "learning_rate": 4.56855401042038e-06, + "loss": 0.6319, + "step": 3165 + }, + { + "epoch": 1.1687176962835344, + "grad_norm": 0.8332284688949585, + "learning_rate": 4.568280832264406e-06, + "loss": 0.6772, + "step": 3166 + }, + { + "epoch": 1.1690868816145705, + "grad_norm": 0.8845404982566833, + "learning_rate": 4.568007575824655e-06, + "loss": 0.6557, + "step": 3167 + }, + { + "epoch": 1.1694560669456067, + "grad_norm": 0.8850431442260742, + "learning_rate": 4.567734241111471e-06, + "loss": 0.6904, + "step": 3168 + }, + { + "epoch": 1.1698252522766428, + "grad_norm": 0.8625415563583374, + "learning_rate": 4.567460828135197e-06, + "loss": 0.6904, + "step": 3169 + }, + { + "epoch": 1.170194437607679, + "grad_norm": 0.8857234716415405, + "learning_rate": 4.5671873369061845e-06, + "loss": 0.6489, + "step": 3170 + }, + { + "epoch": 1.1705636229387153, + "grad_norm": 0.9007185697555542, + "learning_rate": 4.566913767434783e-06, + "loss": 0.669, + "step": 3171 + }, + { + "epoch": 1.1709328082697514, + "grad_norm": 0.9015816450119019, + "learning_rate": 4.566640119731347e-06, + "loss": 0.6996, + "step": 3172 + }, + { + "epoch": 1.1713019936007876, + "grad_norm": 0.9044726490974426, + "learning_rate": 4.566366393806236e-06, + "loss": 0.6701, + "step": 3173 + }, + { + "epoch": 1.1716711789318237, + "grad_norm": 0.8698941469192505, + "learning_rate": 4.5660925896698086e-06, + "loss": 0.6608, + "step": 3174 + }, + { + "epoch": 1.1720403642628598, + "grad_norm": 0.8741365075111389, + "learning_rate": 4.565818707332429e-06, + "loss": 0.6309, + "step": 3175 + }, + { + "epoch": 1.1724095495938962, + "grad_norm": 0.8679743409156799, + "learning_rate": 4.565544746804463e-06, + "loss": 0.6472, + "step": 3176 + }, + { + "epoch": 1.1727787349249323, + "grad_norm": 0.906029999256134, + "learning_rate": 4.56527070809628e-06, + "loss": 0.6783, + "step": 3177 + }, + { + "epoch": 1.1731479202559685, + "grad_norm": 0.8691981434822083, + "learning_rate": 4.564996591218253e-06, + "loss": 0.6668, + "step": 3178 + }, + { + "epoch": 1.1735171055870046, + "grad_norm": 0.9207571148872375, + "learning_rate": 4.564722396180756e-06, + "loss": 0.6724, + "step": 3179 + }, + { + "epoch": 1.1738862909180408, + "grad_norm": 0.910437285900116, + "learning_rate": 4.564448122994169e-06, + "loss": 0.6347, + "step": 3180 + }, + { + "epoch": 1.1742554762490771, + "grad_norm": 0.8458153009414673, + "learning_rate": 4.5641737716688715e-06, + "loss": 0.624, + "step": 3181 + }, + { + "epoch": 1.1746246615801132, + "grad_norm": 0.8635268211364746, + "learning_rate": 4.563899342215248e-06, + "loss": 0.6675, + "step": 3182 + }, + { + "epoch": 1.1749938469111494, + "grad_norm": 0.863741934299469, + "learning_rate": 4.5636248346436865e-06, + "loss": 0.6318, + "step": 3183 + }, + { + "epoch": 1.1753630322421855, + "grad_norm": 0.8732617497444153, + "learning_rate": 4.563350248964576e-06, + "loss": 0.6586, + "step": 3184 + }, + { + "epoch": 1.1757322175732217, + "grad_norm": 0.8922687768936157, + "learning_rate": 4.563075585188309e-06, + "loss": 0.6829, + "step": 3185 + }, + { + "epoch": 1.176101402904258, + "grad_norm": 0.88872230052948, + "learning_rate": 4.562800843325284e-06, + "loss": 0.6623, + "step": 3186 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.8957573771476746, + "learning_rate": 4.562526023385896e-06, + "loss": 0.6843, + "step": 3187 + }, + { + "epoch": 1.1768397735663303, + "grad_norm": 0.907209038734436, + "learning_rate": 4.56225112538055e-06, + "loss": 0.6395, + "step": 3188 + }, + { + "epoch": 1.1772089588973664, + "grad_norm": 0.881430983543396, + "learning_rate": 4.56197614931965e-06, + "loss": 0.6812, + "step": 3189 + }, + { + "epoch": 1.1775781442284026, + "grad_norm": 0.918123185634613, + "learning_rate": 4.561701095213603e-06, + "loss": 0.6957, + "step": 3190 + }, + { + "epoch": 1.177947329559439, + "grad_norm": 0.896618664264679, + "learning_rate": 4.561425963072821e-06, + "loss": 0.6507, + "step": 3191 + }, + { + "epoch": 1.178316514890475, + "grad_norm": 0.8513078689575195, + "learning_rate": 4.561150752907716e-06, + "loss": 0.6857, + "step": 3192 + }, + { + "epoch": 1.1786857002215112, + "grad_norm": 0.9343494176864624, + "learning_rate": 4.560875464728706e-06, + "loss": 0.6636, + "step": 3193 + }, + { + "epoch": 1.1790548855525473, + "grad_norm": 0.8371302485466003, + "learning_rate": 4.560600098546209e-06, + "loss": 0.6709, + "step": 3194 + }, + { + "epoch": 1.1794240708835835, + "grad_norm": 0.8666498064994812, + "learning_rate": 4.560324654370651e-06, + "loss": 0.6775, + "step": 3195 + }, + { + "epoch": 1.1797932562146198, + "grad_norm": 0.905321478843689, + "learning_rate": 4.560049132212453e-06, + "loss": 0.6866, + "step": 3196 + }, + { + "epoch": 1.180162441545656, + "grad_norm": 0.8645988702774048, + "learning_rate": 4.559773532082045e-06, + "loss": 0.6633, + "step": 3197 + }, + { + "epoch": 1.180531626876692, + "grad_norm": 0.9116607308387756, + "learning_rate": 4.559497853989861e-06, + "loss": 0.6708, + "step": 3198 + }, + { + "epoch": 1.1809008122077282, + "grad_norm": 0.872043788433075, + "learning_rate": 4.559222097946333e-06, + "loss": 0.7004, + "step": 3199 + }, + { + "epoch": 1.1812699975387644, + "grad_norm": 0.857740044593811, + "learning_rate": 4.558946263961898e-06, + "loss": 0.6511, + "step": 3200 + }, + { + "epoch": 1.1816391828698007, + "grad_norm": 0.8758658170700073, + "learning_rate": 4.558670352046995e-06, + "loss": 0.6625, + "step": 3201 + }, + { + "epoch": 1.1820083682008369, + "grad_norm": 0.886246919631958, + "learning_rate": 4.558394362212071e-06, + "loss": 0.6705, + "step": 3202 + }, + { + "epoch": 1.182377553531873, + "grad_norm": 0.8909015655517578, + "learning_rate": 4.558118294467569e-06, + "loss": 0.6713, + "step": 3203 + }, + { + "epoch": 1.1827467388629092, + "grad_norm": 0.8617270588874817, + "learning_rate": 4.55784214882394e-06, + "loss": 0.6406, + "step": 3204 + }, + { + "epoch": 1.1831159241939453, + "grad_norm": 0.8917878866195679, + "learning_rate": 4.5575659252916345e-06, + "loss": 0.6876, + "step": 3205 + }, + { + "epoch": 1.1834851095249816, + "grad_norm": 0.8785607814788818, + "learning_rate": 4.557289623881108e-06, + "loss": 0.6736, + "step": 3206 + }, + { + "epoch": 1.1838542948560178, + "grad_norm": 0.8929194211959839, + "learning_rate": 4.557013244602819e-06, + "loss": 0.6576, + "step": 3207 + }, + { + "epoch": 1.184223480187054, + "grad_norm": 0.8644699454307556, + "learning_rate": 4.556736787467227e-06, + "loss": 0.6619, + "step": 3208 + }, + { + "epoch": 1.18459266551809, + "grad_norm": 0.9307174682617188, + "learning_rate": 4.556460252484798e-06, + "loss": 0.6588, + "step": 3209 + }, + { + "epoch": 1.1849618508491262, + "grad_norm": 0.8863493204116821, + "learning_rate": 4.556183639665997e-06, + "loss": 0.6881, + "step": 3210 + }, + { + "epoch": 1.1853310361801626, + "grad_norm": 0.8825064897537231, + "learning_rate": 4.555906949021294e-06, + "loss": 0.6827, + "step": 3211 + }, + { + "epoch": 1.1857002215111987, + "grad_norm": 0.9005194306373596, + "learning_rate": 4.555630180561162e-06, + "loss": 0.6337, + "step": 3212 + }, + { + "epoch": 1.1860694068422348, + "grad_norm": 0.8814252614974976, + "learning_rate": 4.555353334296075e-06, + "loss": 0.6365, + "step": 3213 + }, + { + "epoch": 1.186438592173271, + "grad_norm": 0.9113121628761292, + "learning_rate": 4.555076410236515e-06, + "loss": 0.6704, + "step": 3214 + }, + { + "epoch": 1.186807777504307, + "grad_norm": 0.8848028182983398, + "learning_rate": 4.554799408392961e-06, + "loss": 0.6786, + "step": 3215 + }, + { + "epoch": 1.1871769628353435, + "grad_norm": 0.8660136461257935, + "learning_rate": 4.554522328775898e-06, + "loss": 0.6712, + "step": 3216 + }, + { + "epoch": 1.1875461481663796, + "grad_norm": 0.8836926221847534, + "learning_rate": 4.554245171395813e-06, + "loss": 0.643, + "step": 3217 + }, + { + "epoch": 1.1879153334974157, + "grad_norm": 0.8380313515663147, + "learning_rate": 4.553967936263196e-06, + "loss": 0.6446, + "step": 3218 + }, + { + "epoch": 1.1882845188284519, + "grad_norm": 0.8486880660057068, + "learning_rate": 4.553690623388541e-06, + "loss": 0.6608, + "step": 3219 + }, + { + "epoch": 1.188653704159488, + "grad_norm": 0.8689664602279663, + "learning_rate": 4.553413232782344e-06, + "loss": 0.6349, + "step": 3220 + }, + { + "epoch": 1.1890228894905241, + "grad_norm": 0.8685415983200073, + "learning_rate": 4.553135764455105e-06, + "loss": 0.6867, + "step": 3221 + }, + { + "epoch": 1.1893920748215605, + "grad_norm": 0.8657764792442322, + "learning_rate": 4.552858218417325e-06, + "loss": 0.6816, + "step": 3222 + }, + { + "epoch": 1.1897612601525966, + "grad_norm": 0.9072649478912354, + "learning_rate": 4.552580594679508e-06, + "loss": 0.6507, + "step": 3223 + }, + { + "epoch": 1.1901304454836328, + "grad_norm": 0.872866153717041, + "learning_rate": 4.552302893252166e-06, + "loss": 0.689, + "step": 3224 + }, + { + "epoch": 1.190499630814669, + "grad_norm": 0.9205130338668823, + "learning_rate": 4.552025114145805e-06, + "loss": 0.6491, + "step": 3225 + }, + { + "epoch": 1.190868816145705, + "grad_norm": 0.8630295395851135, + "learning_rate": 4.551747257370942e-06, + "loss": 0.6404, + "step": 3226 + }, + { + "epoch": 1.1912380014767414, + "grad_norm": 0.8629109263420105, + "learning_rate": 4.551469322938092e-06, + "loss": 0.6752, + "step": 3227 + }, + { + "epoch": 1.1916071868077776, + "grad_norm": 0.9048233032226562, + "learning_rate": 4.5511913108577765e-06, + "loss": 0.6766, + "step": 3228 + }, + { + "epoch": 1.1919763721388137, + "grad_norm": 0.9175147414207458, + "learning_rate": 4.550913221140517e-06, + "loss": 0.6597, + "step": 3229 + }, + { + "epoch": 1.1923455574698498, + "grad_norm": 0.8732107281684875, + "learning_rate": 4.5506350537968396e-06, + "loss": 0.6198, + "step": 3230 + }, + { + "epoch": 1.192714742800886, + "grad_norm": 0.8889228105545044, + "learning_rate": 4.550356808837272e-06, + "loss": 0.6645, + "step": 3231 + }, + { + "epoch": 1.193083928131922, + "grad_norm": 0.8769485950469971, + "learning_rate": 4.5500784862723465e-06, + "loss": 0.6049, + "step": 3232 + }, + { + "epoch": 1.1934531134629585, + "grad_norm": 0.8569746017456055, + "learning_rate": 4.549800086112598e-06, + "loss": 0.683, + "step": 3233 + }, + { + "epoch": 1.1938222987939946, + "grad_norm": 0.9431794285774231, + "learning_rate": 4.5495216083685625e-06, + "loss": 0.6091, + "step": 3234 + }, + { + "epoch": 1.1941914841250307, + "grad_norm": 0.8798436522483826, + "learning_rate": 4.549243053050781e-06, + "loss": 0.7044, + "step": 3235 + }, + { + "epoch": 1.1945606694560669, + "grad_norm": 0.8860886096954346, + "learning_rate": 4.548964420169797e-06, + "loss": 0.6204, + "step": 3236 + }, + { + "epoch": 1.194929854787103, + "grad_norm": 0.862088680267334, + "learning_rate": 4.548685709736156e-06, + "loss": 0.6574, + "step": 3237 + }, + { + "epoch": 1.1952990401181394, + "grad_norm": 0.897233247756958, + "learning_rate": 4.548406921760407e-06, + "loss": 0.6308, + "step": 3238 + }, + { + "epoch": 1.1956682254491755, + "grad_norm": 0.919031023979187, + "learning_rate": 4.5481280562531034e-06, + "loss": 0.6724, + "step": 3239 + }, + { + "epoch": 1.1960374107802116, + "grad_norm": 0.9182099103927612, + "learning_rate": 4.5478491132248e-06, + "loss": 0.6778, + "step": 3240 + }, + { + "epoch": 1.1964065961112478, + "grad_norm": 0.8858866691589355, + "learning_rate": 4.5475700926860524e-06, + "loss": 0.6864, + "step": 3241 + }, + { + "epoch": 1.196775781442284, + "grad_norm": 0.932297945022583, + "learning_rate": 4.547290994647423e-06, + "loss": 0.6684, + "step": 3242 + }, + { + "epoch": 1.1971449667733203, + "grad_norm": 0.869464635848999, + "learning_rate": 4.547011819119476e-06, + "loss": 0.6938, + "step": 3243 + }, + { + "epoch": 1.1975141521043564, + "grad_norm": 0.8767672181129456, + "learning_rate": 4.546732566112778e-06, + "loss": 0.671, + "step": 3244 + }, + { + "epoch": 1.1978833374353925, + "grad_norm": 0.8646097183227539, + "learning_rate": 4.5464532356378974e-06, + "loss": 0.6331, + "step": 3245 + }, + { + "epoch": 1.1982525227664287, + "grad_norm": 0.8739516735076904, + "learning_rate": 4.546173827705409e-06, + "loss": 0.6844, + "step": 3246 + }, + { + "epoch": 1.1986217080974648, + "grad_norm": 0.8882744312286377, + "learning_rate": 4.545894342325886e-06, + "loss": 0.6706, + "step": 3247 + }, + { + "epoch": 1.1989908934285012, + "grad_norm": 0.8905485272407532, + "learning_rate": 4.545614779509908e-06, + "loss": 0.6739, + "step": 3248 + }, + { + "epoch": 1.1993600787595373, + "grad_norm": 0.8905662298202515, + "learning_rate": 4.545335139268057e-06, + "loss": 0.7226, + "step": 3249 + }, + { + "epoch": 1.1997292640905735, + "grad_norm": 0.9155777096748352, + "learning_rate": 4.5450554216109165e-06, + "loss": 0.6706, + "step": 3250 + }, + { + "epoch": 1.2000984494216096, + "grad_norm": 0.8896726965904236, + "learning_rate": 4.5447756265490726e-06, + "loss": 0.672, + "step": 3251 + }, + { + "epoch": 1.2004676347526457, + "grad_norm": 0.8563463687896729, + "learning_rate": 4.544495754093118e-06, + "loss": 0.6568, + "step": 3252 + }, + { + "epoch": 1.200836820083682, + "grad_norm": 0.8982632756233215, + "learning_rate": 4.544215804253643e-06, + "loss": 0.6987, + "step": 3253 + }, + { + "epoch": 1.2012060054147182, + "grad_norm": 0.9247127175331116, + "learning_rate": 4.543935777041245e-06, + "loss": 0.6814, + "step": 3254 + }, + { + "epoch": 1.2015751907457544, + "grad_norm": 0.8825064897537231, + "learning_rate": 4.543655672466524e-06, + "loss": 0.6746, + "step": 3255 + }, + { + "epoch": 1.2019443760767905, + "grad_norm": 0.882178544998169, + "learning_rate": 4.54337549054008e-06, + "loss": 0.6524, + "step": 3256 + }, + { + "epoch": 1.2023135614078266, + "grad_norm": 0.8762989044189453, + "learning_rate": 4.543095231272518e-06, + "loss": 0.6602, + "step": 3257 + }, + { + "epoch": 1.202682746738863, + "grad_norm": 0.9170951247215271, + "learning_rate": 4.542814894674447e-06, + "loss": 0.7022, + "step": 3258 + }, + { + "epoch": 1.2030519320698991, + "grad_norm": 0.9570855498313904, + "learning_rate": 4.542534480756478e-06, + "loss": 0.6701, + "step": 3259 + }, + { + "epoch": 1.2034211174009353, + "grad_norm": 0.9000840187072754, + "learning_rate": 4.542253989529223e-06, + "loss": 0.7123, + "step": 3260 + }, + { + "epoch": 1.2037903027319714, + "grad_norm": 0.9342755079269409, + "learning_rate": 4.541973421003298e-06, + "loss": 0.6813, + "step": 3261 + }, + { + "epoch": 1.2041594880630075, + "grad_norm": 0.8912753462791443, + "learning_rate": 4.541692775189324e-06, + "loss": 0.6698, + "step": 3262 + }, + { + "epoch": 1.204528673394044, + "grad_norm": 0.8646280169487, + "learning_rate": 4.541412052097923e-06, + "loss": 0.6684, + "step": 3263 + }, + { + "epoch": 1.20489785872508, + "grad_norm": 0.8513126373291016, + "learning_rate": 4.54113125173972e-06, + "loss": 0.6649, + "step": 3264 + }, + { + "epoch": 1.2052670440561162, + "grad_norm": 0.8239273428916931, + "learning_rate": 4.540850374125344e-06, + "loss": 0.6627, + "step": 3265 + }, + { + "epoch": 1.2056362293871523, + "grad_norm": 0.8332374095916748, + "learning_rate": 4.540569419265424e-06, + "loss": 0.6642, + "step": 3266 + }, + { + "epoch": 1.2060054147181885, + "grad_norm": 0.8597453236579895, + "learning_rate": 4.5402883871705965e-06, + "loss": 0.6522, + "step": 3267 + }, + { + "epoch": 1.2063746000492248, + "grad_norm": 0.8521862626075745, + "learning_rate": 4.5400072778514966e-06, + "loss": 0.6659, + "step": 3268 + }, + { + "epoch": 1.206743785380261, + "grad_norm": 0.850471556186676, + "learning_rate": 4.539726091318767e-06, + "loss": 0.6519, + "step": 3269 + }, + { + "epoch": 1.207112970711297, + "grad_norm": 0.8755480647087097, + "learning_rate": 4.539444827583047e-06, + "loss": 0.6604, + "step": 3270 + }, + { + "epoch": 1.2074821560423332, + "grad_norm": 0.8768699169158936, + "learning_rate": 4.539163486654984e-06, + "loss": 0.6919, + "step": 3271 + }, + { + "epoch": 1.2078513413733694, + "grad_norm": 0.8651711940765381, + "learning_rate": 4.538882068545227e-06, + "loss": 0.6323, + "step": 3272 + }, + { + "epoch": 1.2082205267044057, + "grad_norm": 0.8675568103790283, + "learning_rate": 4.538600573264427e-06, + "loss": 0.666, + "step": 3273 + }, + { + "epoch": 1.2085897120354419, + "grad_norm": 0.8694093823432922, + "learning_rate": 4.538319000823239e-06, + "loss": 0.6968, + "step": 3274 + }, + { + "epoch": 1.208958897366478, + "grad_norm": 0.9025878310203552, + "learning_rate": 4.538037351232319e-06, + "loss": 0.6351, + "step": 3275 + }, + { + "epoch": 1.2093280826975141, + "grad_norm": 0.8760859966278076, + "learning_rate": 4.5377556245023295e-06, + "loss": 0.665, + "step": 3276 + }, + { + "epoch": 1.2096972680285503, + "grad_norm": 0.8768151998519897, + "learning_rate": 4.537473820643933e-06, + "loss": 0.6504, + "step": 3277 + }, + { + "epoch": 1.2100664533595866, + "grad_norm": 0.8331866264343262, + "learning_rate": 4.537191939667794e-06, + "loss": 0.6638, + "step": 3278 + }, + { + "epoch": 1.2104356386906228, + "grad_norm": 0.8496575355529785, + "learning_rate": 4.536909981584585e-06, + "loss": 0.6949, + "step": 3279 + }, + { + "epoch": 1.210804824021659, + "grad_norm": 0.8720790147781372, + "learning_rate": 4.536627946404974e-06, + "loss": 0.6919, + "step": 3280 + }, + { + "epoch": 1.211174009352695, + "grad_norm": 0.8877255320549011, + "learning_rate": 4.53634583413964e-06, + "loss": 0.6922, + "step": 3281 + }, + { + "epoch": 1.2115431946837312, + "grad_norm": 0.8780970573425293, + "learning_rate": 4.536063644799258e-06, + "loss": 0.6929, + "step": 3282 + }, + { + "epoch": 1.2119123800147675, + "grad_norm": 0.8866240978240967, + "learning_rate": 4.53578137839451e-06, + "loss": 0.6707, + "step": 3283 + }, + { + "epoch": 1.2122815653458037, + "grad_norm": 0.8466463685035706, + "learning_rate": 4.535499034936079e-06, + "loss": 0.701, + "step": 3284 + }, + { + "epoch": 1.2126507506768398, + "grad_norm": 0.890760064125061, + "learning_rate": 4.535216614434651e-06, + "loss": 0.6847, + "step": 3285 + }, + { + "epoch": 1.213019936007876, + "grad_norm": 0.8815742135047913, + "learning_rate": 4.534934116900918e-06, + "loss": 0.693, + "step": 3286 + }, + { + "epoch": 1.213389121338912, + "grad_norm": 0.8746995329856873, + "learning_rate": 4.53465154234557e-06, + "loss": 0.666, + "step": 3287 + }, + { + "epoch": 1.2137583066699482, + "grad_norm": 0.875745415687561, + "learning_rate": 4.534368890779303e-06, + "loss": 0.6651, + "step": 3288 + }, + { + "epoch": 1.2141274920009846, + "grad_norm": 0.8745843768119812, + "learning_rate": 4.534086162212816e-06, + "loss": 0.6559, + "step": 3289 + }, + { + "epoch": 1.2144966773320207, + "grad_norm": 0.8745847344398499, + "learning_rate": 4.533803356656809e-06, + "loss": 0.6737, + "step": 3290 + }, + { + "epoch": 1.2148658626630569, + "grad_norm": 0.9050351977348328, + "learning_rate": 4.533520474121988e-06, + "loss": 0.6682, + "step": 3291 + }, + { + "epoch": 1.215235047994093, + "grad_norm": 0.8686885833740234, + "learning_rate": 4.533237514619059e-06, + "loss": 0.6848, + "step": 3292 + }, + { + "epoch": 1.2156042333251291, + "grad_norm": 0.8773931264877319, + "learning_rate": 4.532954478158731e-06, + "loss": 0.6646, + "step": 3293 + }, + { + "epoch": 1.2159734186561655, + "grad_norm": 0.9115171432495117, + "learning_rate": 4.532671364751718e-06, + "loss": 0.6737, + "step": 3294 + }, + { + "epoch": 1.2163426039872016, + "grad_norm": 0.8707975745201111, + "learning_rate": 4.532388174408735e-06, + "loss": 0.6889, + "step": 3295 + }, + { + "epoch": 1.2167117893182378, + "grad_norm": 0.8771401047706604, + "learning_rate": 4.532104907140501e-06, + "loss": 0.6561, + "step": 3296 + }, + { + "epoch": 1.217080974649274, + "grad_norm": 0.8906264305114746, + "learning_rate": 4.531821562957738e-06, + "loss": 0.6913, + "step": 3297 + }, + { + "epoch": 1.21745015998031, + "grad_norm": 0.9083592891693115, + "learning_rate": 4.53153814187117e-06, + "loss": 0.6429, + "step": 3298 + }, + { + "epoch": 1.2178193453113464, + "grad_norm": 0.8643853664398193, + "learning_rate": 4.531254643891524e-06, + "loss": 0.683, + "step": 3299 + }, + { + "epoch": 1.2181885306423825, + "grad_norm": 0.876392126083374, + "learning_rate": 4.530971069029531e-06, + "loss": 0.6643, + "step": 3300 + }, + { + "epoch": 1.2185577159734187, + "grad_norm": 0.863879919052124, + "learning_rate": 4.530687417295925e-06, + "loss": 0.6584, + "step": 3301 + }, + { + "epoch": 1.2189269013044548, + "grad_norm": 0.8785788416862488, + "learning_rate": 4.53040368870144e-06, + "loss": 0.6571, + "step": 3302 + }, + { + "epoch": 1.219296086635491, + "grad_norm": 0.9143131375312805, + "learning_rate": 4.5301198832568164e-06, + "loss": 0.6833, + "step": 3303 + }, + { + "epoch": 1.219665271966527, + "grad_norm": 0.8727525472640991, + "learning_rate": 4.529836000972796e-06, + "loss": 0.6579, + "step": 3304 + }, + { + "epoch": 1.2200344572975634, + "grad_norm": 0.860600471496582, + "learning_rate": 4.529552041860124e-06, + "loss": 0.6474, + "step": 3305 + }, + { + "epoch": 1.2204036426285996, + "grad_norm": 0.8805179595947266, + "learning_rate": 4.5292680059295465e-06, + "loss": 0.6789, + "step": 3306 + }, + { + "epoch": 1.2207728279596357, + "grad_norm": 0.8818009495735168, + "learning_rate": 4.5289838931918175e-06, + "loss": 0.6173, + "step": 3307 + }, + { + "epoch": 1.2211420132906718, + "grad_norm": 0.876557469367981, + "learning_rate": 4.528699703657687e-06, + "loss": 0.6661, + "step": 3308 + }, + { + "epoch": 1.221511198621708, + "grad_norm": 0.9014097452163696, + "learning_rate": 4.528415437337913e-06, + "loss": 0.6549, + "step": 3309 + }, + { + "epoch": 1.2218803839527443, + "grad_norm": 0.8479263782501221, + "learning_rate": 4.528131094243255e-06, + "loss": 0.6581, + "step": 3310 + }, + { + "epoch": 1.2222495692837805, + "grad_norm": 0.8747854828834534, + "learning_rate": 4.527846674384476e-06, + "loss": 0.6762, + "step": 3311 + }, + { + "epoch": 1.2226187546148166, + "grad_norm": 0.877360463142395, + "learning_rate": 4.52756217777234e-06, + "loss": 0.6557, + "step": 3312 + }, + { + "epoch": 1.2229879399458528, + "grad_norm": 0.8572728037834167, + "learning_rate": 4.527277604417615e-06, + "loss": 0.6559, + "step": 3313 + }, + { + "epoch": 1.223357125276889, + "grad_norm": 0.8739327788352966, + "learning_rate": 4.526992954331073e-06, + "loss": 0.677, + "step": 3314 + }, + { + "epoch": 1.2237263106079252, + "grad_norm": 0.8547506928443909, + "learning_rate": 4.526708227523486e-06, + "loss": 0.6384, + "step": 3315 + }, + { + "epoch": 1.2240954959389614, + "grad_norm": 0.9073244333267212, + "learning_rate": 4.5264234240056334e-06, + "loss": 0.67, + "step": 3316 + }, + { + "epoch": 1.2244646812699975, + "grad_norm": 0.8969650268554688, + "learning_rate": 4.526138543788294e-06, + "loss": 0.6829, + "step": 3317 + }, + { + "epoch": 1.2248338666010337, + "grad_norm": 0.9018692970275879, + "learning_rate": 4.52585358688225e-06, + "loss": 0.6597, + "step": 3318 + }, + { + "epoch": 1.2252030519320698, + "grad_norm": 0.8825366497039795, + "learning_rate": 4.525568553298287e-06, + "loss": 0.6429, + "step": 3319 + }, + { + "epoch": 1.2255722372631062, + "grad_norm": 0.8608793616294861, + "learning_rate": 4.525283443047194e-06, + "loss": 0.6551, + "step": 3320 + }, + { + "epoch": 1.2259414225941423, + "grad_norm": 0.8649479746818542, + "learning_rate": 4.524998256139761e-06, + "loss": 0.6498, + "step": 3321 + }, + { + "epoch": 1.2263106079251784, + "grad_norm": 0.8978915810585022, + "learning_rate": 4.5247129925867835e-06, + "loss": 0.6833, + "step": 3322 + }, + { + "epoch": 1.2266797932562146, + "grad_norm": 0.8771827220916748, + "learning_rate": 4.524427652399059e-06, + "loss": 0.6508, + "step": 3323 + }, + { + "epoch": 1.2270489785872507, + "grad_norm": 0.8857488632202148, + "learning_rate": 4.524142235587386e-06, + "loss": 0.68, + "step": 3324 + }, + { + "epoch": 1.227418163918287, + "grad_norm": 0.9005944132804871, + "learning_rate": 4.523856742162568e-06, + "loss": 0.6591, + "step": 3325 + }, + { + "epoch": 1.2277873492493232, + "grad_norm": 0.8886708617210388, + "learning_rate": 4.523571172135411e-06, + "loss": 0.6432, + "step": 3326 + }, + { + "epoch": 1.2281565345803593, + "grad_norm": 0.8557465672492981, + "learning_rate": 4.523285525516724e-06, + "loss": 0.7078, + "step": 3327 + }, + { + "epoch": 1.2285257199113955, + "grad_norm": 0.8991360664367676, + "learning_rate": 4.5229998023173185e-06, + "loss": 0.6221, + "step": 3328 + }, + { + "epoch": 1.2288949052424316, + "grad_norm": 0.8624705076217651, + "learning_rate": 4.522714002548009e-06, + "loss": 0.6642, + "step": 3329 + }, + { + "epoch": 1.229264090573468, + "grad_norm": 0.8681010007858276, + "learning_rate": 4.522428126219613e-06, + "loss": 0.7066, + "step": 3330 + }, + { + "epoch": 1.229633275904504, + "grad_norm": 0.8717160224914551, + "learning_rate": 4.522142173342951e-06, + "loss": 0.663, + "step": 3331 + }, + { + "epoch": 1.2300024612355402, + "grad_norm": 0.8538660407066345, + "learning_rate": 4.521856143928844e-06, + "loss": 0.6669, + "step": 3332 + }, + { + "epoch": 1.2303716465665764, + "grad_norm": 0.8503022193908691, + "learning_rate": 4.521570037988122e-06, + "loss": 0.6494, + "step": 3333 + }, + { + "epoch": 1.2307408318976125, + "grad_norm": 0.8975503444671631, + "learning_rate": 4.521283855531612e-06, + "loss": 0.6892, + "step": 3334 + }, + { + "epoch": 1.2311100172286489, + "grad_norm": 0.8300214409828186, + "learning_rate": 4.520997596570145e-06, + "loss": 0.621, + "step": 3335 + }, + { + "epoch": 1.231479202559685, + "grad_norm": 0.9026924967765808, + "learning_rate": 4.520711261114558e-06, + "loss": 0.6844, + "step": 3336 + }, + { + "epoch": 1.2318483878907212, + "grad_norm": 0.855913519859314, + "learning_rate": 4.520424849175686e-06, + "loss": 0.6337, + "step": 3337 + }, + { + "epoch": 1.2322175732217573, + "grad_norm": 0.8346027731895447, + "learning_rate": 4.520138360764372e-06, + "loss": 0.6493, + "step": 3338 + }, + { + "epoch": 1.2325867585527934, + "grad_norm": 0.9031023383140564, + "learning_rate": 4.519851795891459e-06, + "loss": 0.6579, + "step": 3339 + }, + { + "epoch": 1.2329559438838298, + "grad_norm": 0.8674537539482117, + "learning_rate": 4.519565154567793e-06, + "loss": 0.6511, + "step": 3340 + }, + { + "epoch": 1.233325129214866, + "grad_norm": 0.8472675681114197, + "learning_rate": 4.519278436804223e-06, + "loss": 0.6527, + "step": 3341 + }, + { + "epoch": 1.233694314545902, + "grad_norm": 0.8838043808937073, + "learning_rate": 4.518991642611601e-06, + "loss": 0.6669, + "step": 3342 + }, + { + "epoch": 1.2340634998769382, + "grad_norm": 0.8990933299064636, + "learning_rate": 4.518704772000782e-06, + "loss": 0.6533, + "step": 3343 + }, + { + "epoch": 1.2344326852079743, + "grad_norm": 0.886562705039978, + "learning_rate": 4.518417824982626e-06, + "loss": 0.6715, + "step": 3344 + }, + { + "epoch": 1.2348018705390107, + "grad_norm": 0.8894440531730652, + "learning_rate": 4.518130801567992e-06, + "loss": 0.6906, + "step": 3345 + }, + { + "epoch": 1.2351710558700468, + "grad_norm": 0.8636101484298706, + "learning_rate": 4.517843701767743e-06, + "loss": 0.6357, + "step": 3346 + }, + { + "epoch": 1.235540241201083, + "grad_norm": 0.844943642616272, + "learning_rate": 4.517556525592748e-06, + "loss": 0.6355, + "step": 3347 + }, + { + "epoch": 1.235909426532119, + "grad_norm": 0.8782294988632202, + "learning_rate": 4.517269273053875e-06, + "loss": 0.6649, + "step": 3348 + }, + { + "epoch": 1.2362786118631552, + "grad_norm": 0.9088314771652222, + "learning_rate": 4.516981944161997e-06, + "loss": 0.6701, + "step": 3349 + }, + { + "epoch": 1.2366477971941916, + "grad_norm": 0.8731002807617188, + "learning_rate": 4.516694538927988e-06, + "loss": 0.6878, + "step": 3350 + }, + { + "epoch": 1.2370169825252277, + "grad_norm": 0.8676177859306335, + "learning_rate": 4.516407057362728e-06, + "loss": 0.6592, + "step": 3351 + }, + { + "epoch": 1.2373861678562639, + "grad_norm": 0.8963800072669983, + "learning_rate": 4.516119499477097e-06, + "loss": 0.6722, + "step": 3352 + }, + { + "epoch": 1.2377553531873, + "grad_norm": 0.8648866415023804, + "learning_rate": 4.515831865281979e-06, + "loss": 0.6905, + "step": 3353 + }, + { + "epoch": 1.2381245385183361, + "grad_norm": 0.917116641998291, + "learning_rate": 4.515544154788261e-06, + "loss": 0.6682, + "step": 3354 + }, + { + "epoch": 1.2384937238493725, + "grad_norm": 0.8611881732940674, + "learning_rate": 4.515256368006834e-06, + "loss": 0.6903, + "step": 3355 + }, + { + "epoch": 1.2388629091804086, + "grad_norm": 0.8889239430427551, + "learning_rate": 4.514968504948588e-06, + "loss": 0.662, + "step": 3356 + }, + { + "epoch": 1.2392320945114448, + "grad_norm": 0.8633840680122375, + "learning_rate": 4.514680565624421e-06, + "loss": 0.6627, + "step": 3357 + }, + { + "epoch": 1.239601279842481, + "grad_norm": 0.9041558504104614, + "learning_rate": 4.51439255004523e-06, + "loss": 0.6753, + "step": 3358 + }, + { + "epoch": 1.239970465173517, + "grad_norm": 0.892274796962738, + "learning_rate": 4.514104458221916e-06, + "loss": 0.6696, + "step": 3359 + }, + { + "epoch": 1.2403396505045532, + "grad_norm": 0.9020706415176392, + "learning_rate": 4.513816290165385e-06, + "loss": 0.6383, + "step": 3360 + }, + { + "epoch": 1.2407088358355896, + "grad_norm": 0.897588312625885, + "learning_rate": 4.513528045886543e-06, + "loss": 0.6405, + "step": 3361 + }, + { + "epoch": 1.2410780211666257, + "grad_norm": 0.8380104303359985, + "learning_rate": 4.5132397253963e-06, + "loss": 0.6782, + "step": 3362 + }, + { + "epoch": 1.2414472064976618, + "grad_norm": 0.8706375360488892, + "learning_rate": 4.5129513287055686e-06, + "loss": 0.6785, + "step": 3363 + }, + { + "epoch": 1.241816391828698, + "grad_norm": 0.8588371276855469, + "learning_rate": 4.512662855825264e-06, + "loss": 0.6528, + "step": 3364 + }, + { + "epoch": 1.242185577159734, + "grad_norm": 0.8973999619483948, + "learning_rate": 4.5123743067663064e-06, + "loss": 0.6568, + "step": 3365 + }, + { + "epoch": 1.2425547624907705, + "grad_norm": 0.8993077874183655, + "learning_rate": 4.5120856815396155e-06, + "loss": 0.6791, + "step": 3366 + }, + { + "epoch": 1.2429239478218066, + "grad_norm": 0.8929685354232788, + "learning_rate": 4.511796980156118e-06, + "loss": 0.6074, + "step": 3367 + }, + { + "epoch": 1.2432931331528427, + "grad_norm": 0.8352000713348389, + "learning_rate": 4.511508202626738e-06, + "loss": 0.6448, + "step": 3368 + }, + { + "epoch": 1.2436623184838789, + "grad_norm": 0.8644148707389832, + "learning_rate": 4.5112193489624086e-06, + "loss": 0.6988, + "step": 3369 + }, + { + "epoch": 1.244031503814915, + "grad_norm": 0.8869681358337402, + "learning_rate": 4.510930419174061e-06, + "loss": 0.66, + "step": 3370 + }, + { + "epoch": 1.2444006891459511, + "grad_norm": 0.906066358089447, + "learning_rate": 4.510641413272632e-06, + "loss": 0.6858, + "step": 3371 + }, + { + "epoch": 1.2447698744769875, + "grad_norm": 0.8813003301620483, + "learning_rate": 4.51035233126906e-06, + "loss": 0.6373, + "step": 3372 + }, + { + "epoch": 1.2451390598080236, + "grad_norm": 0.8385671377182007, + "learning_rate": 4.5100631731742876e-06, + "loss": 0.6498, + "step": 3373 + }, + { + "epoch": 1.2455082451390598, + "grad_norm": 0.8473148941993713, + "learning_rate": 4.509773938999257e-06, + "loss": 0.6291, + "step": 3374 + }, + { + "epoch": 1.245877430470096, + "grad_norm": 0.8513836860656738, + "learning_rate": 4.509484628754918e-06, + "loss": 0.6329, + "step": 3375 + }, + { + "epoch": 1.246246615801132, + "grad_norm": 0.8585875630378723, + "learning_rate": 4.509195242452219e-06, + "loss": 0.6615, + "step": 3376 + }, + { + "epoch": 1.2466158011321684, + "grad_norm": 0.8714626431465149, + "learning_rate": 4.508905780102115e-06, + "loss": 0.6614, + "step": 3377 + }, + { + "epoch": 1.2469849864632045, + "grad_norm": 0.8655300736427307, + "learning_rate": 4.508616241715562e-06, + "loss": 0.6529, + "step": 3378 + }, + { + "epoch": 1.2473541717942407, + "grad_norm": 0.8453882932662964, + "learning_rate": 4.508326627303517e-06, + "loss": 0.677, + "step": 3379 + }, + { + "epoch": 1.2477233571252768, + "grad_norm": 0.8810684680938721, + "learning_rate": 4.5080369368769425e-06, + "loss": 0.6305, + "step": 3380 + }, + { + "epoch": 1.248092542456313, + "grad_norm": 0.8423178195953369, + "learning_rate": 4.5077471704468055e-06, + "loss": 0.7077, + "step": 3381 + }, + { + "epoch": 1.2484617277873493, + "grad_norm": 0.9387530088424683, + "learning_rate": 4.507457328024071e-06, + "loss": 0.6984, + "step": 3382 + }, + { + "epoch": 1.2488309131183855, + "grad_norm": 0.8406319618225098, + "learning_rate": 4.507167409619709e-06, + "loss": 0.6489, + "step": 3383 + }, + { + "epoch": 1.2492000984494216, + "grad_norm": 0.9049533009529114, + "learning_rate": 4.506877415244695e-06, + "loss": 0.6168, + "step": 3384 + }, + { + "epoch": 1.2495692837804577, + "grad_norm": 0.8738078474998474, + "learning_rate": 4.506587344910005e-06, + "loss": 0.6291, + "step": 3385 + }, + { + "epoch": 1.2499384691114939, + "grad_norm": 0.8679043650627136, + "learning_rate": 4.506297198626616e-06, + "loss": 0.6466, + "step": 3386 + }, + { + "epoch": 1.2503076544425302, + "grad_norm": 0.8809841871261597, + "learning_rate": 4.5060069764055116e-06, + "loss": 0.6625, + "step": 3387 + }, + { + "epoch": 1.2506768397735664, + "grad_norm": 0.8762892484664917, + "learning_rate": 4.505716678257676e-06, + "loss": 0.6614, + "step": 3388 + }, + { + "epoch": 1.2510460251046025, + "grad_norm": 0.8782838582992554, + "learning_rate": 4.505426304194098e-06, + "loss": 0.6637, + "step": 3389 + }, + { + "epoch": 1.2514152104356386, + "grad_norm": 0.877668559551239, + "learning_rate": 4.5051358542257674e-06, + "loss": 0.6743, + "step": 3390 + }, + { + "epoch": 1.2517843957666748, + "grad_norm": 0.8830326795578003, + "learning_rate": 4.504845328363676e-06, + "loss": 0.6729, + "step": 3391 + }, + { + "epoch": 1.2521535810977111, + "grad_norm": 0.8492071032524109, + "learning_rate": 4.504554726618823e-06, + "loss": 0.6977, + "step": 3392 + }, + { + "epoch": 1.2525227664287473, + "grad_norm": 0.8964203596115112, + "learning_rate": 4.504264049002206e-06, + "loss": 0.6726, + "step": 3393 + }, + { + "epoch": 1.2528919517597834, + "grad_norm": 0.8635320663452148, + "learning_rate": 4.503973295524827e-06, + "loss": 0.6547, + "step": 3394 + }, + { + "epoch": 1.2532611370908195, + "grad_norm": 0.8662384152412415, + "learning_rate": 4.503682466197692e-06, + "loss": 0.6823, + "step": 3395 + }, + { + "epoch": 1.2536303224218557, + "grad_norm": 0.8868409395217896, + "learning_rate": 4.503391561031807e-06, + "loss": 0.6444, + "step": 3396 + }, + { + "epoch": 1.253999507752892, + "grad_norm": 0.8553755283355713, + "learning_rate": 4.503100580038184e-06, + "loss": 0.6743, + "step": 3397 + }, + { + "epoch": 1.2543686930839282, + "grad_norm": 0.8763161897659302, + "learning_rate": 4.502809523227837e-06, + "loss": 0.6944, + "step": 3398 + }, + { + "epoch": 1.2547378784149643, + "grad_norm": 0.8587225675582886, + "learning_rate": 4.50251839061178e-06, + "loss": 0.6321, + "step": 3399 + }, + { + "epoch": 1.2551070637460005, + "grad_norm": 0.8492217063903809, + "learning_rate": 4.502227182201035e-06, + "loss": 0.6116, + "step": 3400 + }, + { + "epoch": 1.2554762490770366, + "grad_norm": 0.8804566264152527, + "learning_rate": 4.501935898006623e-06, + "loss": 0.6579, + "step": 3401 + }, + { + "epoch": 1.255845434408073, + "grad_norm": 0.8827697038650513, + "learning_rate": 4.501644538039569e-06, + "loss": 0.6553, + "step": 3402 + }, + { + "epoch": 1.256214619739109, + "grad_norm": 0.8854148387908936, + "learning_rate": 4.501353102310901e-06, + "loss": 0.6857, + "step": 3403 + }, + { + "epoch": 1.2565838050701452, + "grad_norm": 0.8799946904182434, + "learning_rate": 4.50106159083165e-06, + "loss": 0.6523, + "step": 3404 + }, + { + "epoch": 1.2569529904011814, + "grad_norm": 0.865587055683136, + "learning_rate": 4.500770003612849e-06, + "loss": 0.6589, + "step": 3405 + }, + { + "epoch": 1.2573221757322175, + "grad_norm": 0.8719598650932312, + "learning_rate": 4.500478340665535e-06, + "loss": 0.6904, + "step": 3406 + }, + { + "epoch": 1.2576913610632539, + "grad_norm": 0.8985024094581604, + "learning_rate": 4.500186602000747e-06, + "loss": 0.6564, + "step": 3407 + }, + { + "epoch": 1.25806054639429, + "grad_norm": 0.8947860598564148, + "learning_rate": 4.499894787629527e-06, + "loss": 0.6367, + "step": 3408 + }, + { + "epoch": 1.2584297317253261, + "grad_norm": 0.8441982269287109, + "learning_rate": 4.499602897562921e-06, + "loss": 0.6246, + "step": 3409 + }, + { + "epoch": 1.2587989170563623, + "grad_norm": 0.8549045920372009, + "learning_rate": 4.499310931811976e-06, + "loss": 0.6453, + "step": 3410 + }, + { + "epoch": 1.2591681023873984, + "grad_norm": 0.9006457924842834, + "learning_rate": 4.499018890387744e-06, + "loss": 0.6616, + "step": 3411 + }, + { + "epoch": 1.2595372877184348, + "grad_norm": 0.8284814953804016, + "learning_rate": 4.498726773301277e-06, + "loss": 0.6408, + "step": 3412 + }, + { + "epoch": 1.259906473049471, + "grad_norm": 0.8476769328117371, + "learning_rate": 4.498434580563634e-06, + "loss": 0.6572, + "step": 3413 + }, + { + "epoch": 1.260275658380507, + "grad_norm": 0.8859312534332275, + "learning_rate": 4.4981423121858716e-06, + "loss": 0.6572, + "step": 3414 + }, + { + "epoch": 1.2606448437115432, + "grad_norm": 0.9216022491455078, + "learning_rate": 4.497849968179053e-06, + "loss": 0.6837, + "step": 3415 + }, + { + "epoch": 1.2610140290425793, + "grad_norm": 0.8576871156692505, + "learning_rate": 4.497557548554245e-06, + "loss": 0.665, + "step": 3416 + }, + { + "epoch": 1.2613832143736157, + "grad_norm": 0.8900001645088196, + "learning_rate": 4.497265053322512e-06, + "loss": 0.6772, + "step": 3417 + }, + { + "epoch": 1.2617523997046518, + "grad_norm": 0.855173647403717, + "learning_rate": 4.496972482494929e-06, + "loss": 0.6358, + "step": 3418 + }, + { + "epoch": 1.262121585035688, + "grad_norm": 0.877885639667511, + "learning_rate": 4.496679836082567e-06, + "loss": 0.6631, + "step": 3419 + }, + { + "epoch": 1.262490770366724, + "grad_norm": 0.8535313606262207, + "learning_rate": 4.496387114096503e-06, + "loss": 0.6311, + "step": 3420 + }, + { + "epoch": 1.2628599556977602, + "grad_norm": 0.8361259698867798, + "learning_rate": 4.496094316547818e-06, + "loss": 0.629, + "step": 3421 + }, + { + "epoch": 1.2632291410287966, + "grad_norm": 0.8624289631843567, + "learning_rate": 4.495801443447592e-06, + "loss": 0.6821, + "step": 3422 + }, + { + "epoch": 1.2635983263598327, + "grad_norm": 0.8534937500953674, + "learning_rate": 4.495508494806912e-06, + "loss": 0.6445, + "step": 3423 + }, + { + "epoch": 1.2639675116908689, + "grad_norm": 0.8931602835655212, + "learning_rate": 4.495215470636865e-06, + "loss": 0.6891, + "step": 3424 + }, + { + "epoch": 1.264336697021905, + "grad_norm": 0.9143256545066833, + "learning_rate": 4.494922370948542e-06, + "loss": 0.6427, + "step": 3425 + }, + { + "epoch": 1.2647058823529411, + "grad_norm": 0.8678403496742249, + "learning_rate": 4.494629195753037e-06, + "loss": 0.672, + "step": 3426 + }, + { + "epoch": 1.2650750676839775, + "grad_norm": 0.8680613040924072, + "learning_rate": 4.494335945061446e-06, + "loss": 0.6635, + "step": 3427 + }, + { + "epoch": 1.2654442530150134, + "grad_norm": 0.8901352882385254, + "learning_rate": 4.49404261888487e-06, + "loss": 0.6855, + "step": 3428 + }, + { + "epoch": 1.2658134383460498, + "grad_norm": 0.8658345341682434, + "learning_rate": 4.493749217234409e-06, + "loss": 0.671, + "step": 3429 + }, + { + "epoch": 1.266182623677086, + "grad_norm": 0.8942663073539734, + "learning_rate": 4.49345574012117e-06, + "loss": 0.6518, + "step": 3430 + }, + { + "epoch": 1.266551809008122, + "grad_norm": 0.8919180035591125, + "learning_rate": 4.493162187556261e-06, + "loss": 0.6717, + "step": 3431 + }, + { + "epoch": 1.2669209943391584, + "grad_norm": 0.9057881236076355, + "learning_rate": 4.4928685595507906e-06, + "loss": 0.6631, + "step": 3432 + }, + { + "epoch": 1.2672901796701943, + "grad_norm": 0.8948836922645569, + "learning_rate": 4.492574856115875e-06, + "loss": 0.6976, + "step": 3433 + }, + { + "epoch": 1.2676593650012307, + "grad_norm": 0.8810674548149109, + "learning_rate": 4.49228107726263e-06, + "loss": 0.6826, + "step": 3434 + }, + { + "epoch": 1.2680285503322668, + "grad_norm": 0.8995678424835205, + "learning_rate": 4.491987223002175e-06, + "loss": 0.7195, + "step": 3435 + }, + { + "epoch": 1.268397735663303, + "grad_norm": 0.8743701577186584, + "learning_rate": 4.4916932933456325e-06, + "loss": 0.6493, + "step": 3436 + }, + { + "epoch": 1.268766920994339, + "grad_norm": 0.8380770683288574, + "learning_rate": 4.491399288304127e-06, + "loss": 0.6985, + "step": 3437 + }, + { + "epoch": 1.2691361063253752, + "grad_norm": 0.8987215161323547, + "learning_rate": 4.4911052078887875e-06, + "loss": 0.6751, + "step": 3438 + }, + { + "epoch": 1.2695052916564116, + "grad_norm": 0.8774052262306213, + "learning_rate": 4.490811052110744e-06, + "loss": 0.6438, + "step": 3439 + }, + { + "epoch": 1.2698744769874477, + "grad_norm": 0.8479379415512085, + "learning_rate": 4.49051682098113e-06, + "loss": 0.6711, + "step": 3440 + }, + { + "epoch": 1.2702436623184838, + "grad_norm": 0.8776026964187622, + "learning_rate": 4.4902225145110835e-06, + "loss": 0.6693, + "step": 3441 + }, + { + "epoch": 1.27061284764952, + "grad_norm": 0.8995134830474854, + "learning_rate": 4.489928132711742e-06, + "loss": 0.6674, + "step": 3442 + }, + { + "epoch": 1.2709820329805561, + "grad_norm": 0.9024117588996887, + "learning_rate": 4.489633675594249e-06, + "loss": 0.6719, + "step": 3443 + }, + { + "epoch": 1.2713512183115925, + "grad_norm": 0.9263385534286499, + "learning_rate": 4.489339143169749e-06, + "loss": 0.6587, + "step": 3444 + }, + { + "epoch": 1.2717204036426286, + "grad_norm": 0.8908492922782898, + "learning_rate": 4.489044535449392e-06, + "loss": 0.6581, + "step": 3445 + }, + { + "epoch": 1.2720895889736648, + "grad_norm": 0.9317528009414673, + "learning_rate": 4.4887498524443254e-06, + "loss": 0.6523, + "step": 3446 + }, + { + "epoch": 1.272458774304701, + "grad_norm": 0.884556233882904, + "learning_rate": 4.4884550941657045e-06, + "loss": 0.6844, + "step": 3447 + }, + { + "epoch": 1.272827959635737, + "grad_norm": 0.8590368628501892, + "learning_rate": 4.488160260624687e-06, + "loss": 0.6614, + "step": 3448 + }, + { + "epoch": 1.2731971449667734, + "grad_norm": 0.8445407748222351, + "learning_rate": 4.48786535183243e-06, + "loss": 0.6844, + "step": 3449 + }, + { + "epoch": 1.2735663302978095, + "grad_norm": 0.8684682846069336, + "learning_rate": 4.487570367800098e-06, + "loss": 0.6923, + "step": 3450 + }, + { + "epoch": 1.2739355156288457, + "grad_norm": 0.9021221995353699, + "learning_rate": 4.4872753085388536e-06, + "loss": 0.6311, + "step": 3451 + }, + { + "epoch": 1.2743047009598818, + "grad_norm": 0.8524183630943298, + "learning_rate": 4.4869801740598664e-06, + "loss": 0.644, + "step": 3452 + }, + { + "epoch": 1.274673886290918, + "grad_norm": 0.8531653881072998, + "learning_rate": 4.486684964374307e-06, + "loss": 0.6188, + "step": 3453 + }, + { + "epoch": 1.2750430716219543, + "grad_norm": 0.8710168600082397, + "learning_rate": 4.486389679493348e-06, + "loss": 0.6781, + "step": 3454 + }, + { + "epoch": 1.2754122569529904, + "grad_norm": 0.8830646872520447, + "learning_rate": 4.486094319428168e-06, + "loss": 0.7208, + "step": 3455 + }, + { + "epoch": 1.2757814422840266, + "grad_norm": 0.8798280954360962, + "learning_rate": 4.485798884189944e-06, + "loss": 0.6771, + "step": 3456 + }, + { + "epoch": 1.2761506276150627, + "grad_norm": 0.8851190805435181, + "learning_rate": 4.485503373789858e-06, + "loss": 0.6848, + "step": 3457 + }, + { + "epoch": 1.2765198129460988, + "grad_norm": 0.8814430236816406, + "learning_rate": 4.485207788239097e-06, + "loss": 0.6294, + "step": 3458 + }, + { + "epoch": 1.2768889982771352, + "grad_norm": 0.8164977431297302, + "learning_rate": 4.484912127548847e-06, + "loss": 0.6601, + "step": 3459 + }, + { + "epoch": 1.2772581836081713, + "grad_norm": 0.9215435981750488, + "learning_rate": 4.4846163917303e-06, + "loss": 0.6057, + "step": 3460 + }, + { + "epoch": 1.2776273689392075, + "grad_norm": 0.8647847175598145, + "learning_rate": 4.484320580794649e-06, + "loss": 0.6761, + "step": 3461 + }, + { + "epoch": 1.2779965542702436, + "grad_norm": 0.8772610425949097, + "learning_rate": 4.484024694753091e-06, + "loss": 0.6932, + "step": 3462 + }, + { + "epoch": 1.2783657396012797, + "grad_norm": 0.892145037651062, + "learning_rate": 4.483728733616824e-06, + "loss": 0.6943, + "step": 3463 + }, + { + "epoch": 1.278734924932316, + "grad_norm": 0.841188907623291, + "learning_rate": 4.48343269739705e-06, + "loss": 0.6662, + "step": 3464 + }, + { + "epoch": 1.2791041102633522, + "grad_norm": 0.9026973247528076, + "learning_rate": 4.4831365861049735e-06, + "loss": 0.6673, + "step": 3465 + }, + { + "epoch": 1.2794732955943884, + "grad_norm": 0.8679662346839905, + "learning_rate": 4.482840399751805e-06, + "loss": 0.6223, + "step": 3466 + }, + { + "epoch": 1.2798424809254245, + "grad_norm": 0.8782039880752563, + "learning_rate": 4.482544138348752e-06, + "loss": 0.6582, + "step": 3467 + }, + { + "epoch": 1.2802116662564607, + "grad_norm": 0.865569531917572, + "learning_rate": 4.482247801907029e-06, + "loss": 0.6884, + "step": 3468 + }, + { + "epoch": 1.280580851587497, + "grad_norm": 0.8688008785247803, + "learning_rate": 4.481951390437852e-06, + "loss": 0.6204, + "step": 3469 + }, + { + "epoch": 1.2809500369185332, + "grad_norm": 0.8695070147514343, + "learning_rate": 4.481654903952441e-06, + "loss": 0.6541, + "step": 3470 + }, + { + "epoch": 1.2813192222495693, + "grad_norm": 0.8829357028007507, + "learning_rate": 4.481358342462017e-06, + "loss": 0.645, + "step": 3471 + }, + { + "epoch": 1.2816884075806054, + "grad_norm": 0.8592128157615662, + "learning_rate": 4.481061705977806e-06, + "loss": 0.6596, + "step": 3472 + }, + { + "epoch": 1.2820575929116416, + "grad_norm": 0.8336278200149536, + "learning_rate": 4.480764994511035e-06, + "loss": 0.6905, + "step": 3473 + }, + { + "epoch": 1.282426778242678, + "grad_norm": 0.8724321126937866, + "learning_rate": 4.4804682080729325e-06, + "loss": 0.6109, + "step": 3474 + }, + { + "epoch": 1.282795963573714, + "grad_norm": 0.8557796478271484, + "learning_rate": 4.4801713466747345e-06, + "loss": 0.6856, + "step": 3475 + }, + { + "epoch": 1.2831651489047502, + "grad_norm": 0.8673741221427917, + "learning_rate": 4.479874410327675e-06, + "loss": 0.6669, + "step": 3476 + }, + { + "epoch": 1.2835343342357863, + "grad_norm": 0.9282702803611755, + "learning_rate": 4.479577399042995e-06, + "loss": 0.6793, + "step": 3477 + }, + { + "epoch": 1.2839035195668225, + "grad_norm": 0.8718639016151428, + "learning_rate": 4.479280312831935e-06, + "loss": 0.6538, + "step": 3478 + }, + { + "epoch": 1.2842727048978588, + "grad_norm": 0.8712838888168335, + "learning_rate": 4.47898315170574e-06, + "loss": 0.6711, + "step": 3479 + }, + { + "epoch": 1.284641890228895, + "grad_norm": 0.8944904804229736, + "learning_rate": 4.478685915675656e-06, + "loss": 0.7259, + "step": 3480 + }, + { + "epoch": 1.285011075559931, + "grad_norm": 0.9143025279045105, + "learning_rate": 4.478388604752934e-06, + "loss": 0.6888, + "step": 3481 + }, + { + "epoch": 1.2853802608909672, + "grad_norm": 0.9140299558639526, + "learning_rate": 4.47809121894883e-06, + "loss": 0.6492, + "step": 3482 + }, + { + "epoch": 1.2857494462220034, + "grad_norm": 0.863661527633667, + "learning_rate": 4.477793758274596e-06, + "loss": 0.6825, + "step": 3483 + }, + { + "epoch": 1.2861186315530397, + "grad_norm": 0.8772644996643066, + "learning_rate": 4.477496222741494e-06, + "loss": 0.6445, + "step": 3484 + }, + { + "epoch": 1.2864878168840759, + "grad_norm": 0.864403486251831, + "learning_rate": 4.477198612360783e-06, + "loss": 0.6559, + "step": 3485 + }, + { + "epoch": 1.286857002215112, + "grad_norm": 0.8510833382606506, + "learning_rate": 4.476900927143729e-06, + "loss": 0.6725, + "step": 3486 + }, + { + "epoch": 1.2872261875461481, + "grad_norm": 0.8688685297966003, + "learning_rate": 4.476603167101599e-06, + "loss": 0.6471, + "step": 3487 + }, + { + "epoch": 1.2875953728771843, + "grad_norm": 0.889005184173584, + "learning_rate": 4.476305332245662e-06, + "loss": 0.6752, + "step": 3488 + }, + { + "epoch": 1.2879645582082206, + "grad_norm": 0.8784791231155396, + "learning_rate": 4.476007422587192e-06, + "loss": 0.6556, + "step": 3489 + }, + { + "epoch": 1.2883337435392568, + "grad_norm": 0.8266776204109192, + "learning_rate": 4.475709438137466e-06, + "loss": 0.6502, + "step": 3490 + }, + { + "epoch": 1.288702928870293, + "grad_norm": 0.9370036721229553, + "learning_rate": 4.47541137890776e-06, + "loss": 0.6388, + "step": 3491 + }, + { + "epoch": 1.289072114201329, + "grad_norm": 0.8785253763198853, + "learning_rate": 4.475113244909357e-06, + "loss": 0.6754, + "step": 3492 + }, + { + "epoch": 1.2894412995323652, + "grad_norm": 0.8789678812026978, + "learning_rate": 4.4748150361535415e-06, + "loss": 0.7097, + "step": 3493 + }, + { + "epoch": 1.2898104848634016, + "grad_norm": 0.9050271511077881, + "learning_rate": 4.4745167526515995e-06, + "loss": 0.6867, + "step": 3494 + }, + { + "epoch": 1.2901796701944377, + "grad_norm": 0.8990874290466309, + "learning_rate": 4.474218394414822e-06, + "loss": 0.6569, + "step": 3495 + }, + { + "epoch": 1.2905488555254738, + "grad_norm": 0.8633016347885132, + "learning_rate": 4.473919961454501e-06, + "loss": 0.6559, + "step": 3496 + }, + { + "epoch": 1.29091804085651, + "grad_norm": 0.8582134246826172, + "learning_rate": 4.473621453781933e-06, + "loss": 0.6228, + "step": 3497 + }, + { + "epoch": 1.291287226187546, + "grad_norm": 0.8260644674301147, + "learning_rate": 4.473322871408416e-06, + "loss": 0.618, + "step": 3498 + }, + { + "epoch": 1.2916564115185825, + "grad_norm": 0.8730311393737793, + "learning_rate": 4.473024214345251e-06, + "loss": 0.6818, + "step": 3499 + }, + { + "epoch": 1.2920255968496184, + "grad_norm": 0.9131650924682617, + "learning_rate": 4.472725482603742e-06, + "loss": 0.7146, + "step": 3500 + }, + { + "epoch": 1.2923947821806547, + "grad_norm": 0.8529634475708008, + "learning_rate": 4.472426676195195e-06, + "loss": 0.6772, + "step": 3501 + }, + { + "epoch": 1.2927639675116909, + "grad_norm": 0.8632826209068298, + "learning_rate": 4.4721277951309226e-06, + "loss": 0.6846, + "step": 3502 + }, + { + "epoch": 1.293133152842727, + "grad_norm": 0.8923689723014832, + "learning_rate": 4.471828839422234e-06, + "loss": 0.6386, + "step": 3503 + }, + { + "epoch": 1.2935023381737634, + "grad_norm": 0.8838045597076416, + "learning_rate": 4.471529809080447e-06, + "loss": 0.6865, + "step": 3504 + }, + { + "epoch": 1.2938715235047993, + "grad_norm": 0.9002860188484192, + "learning_rate": 4.471230704116879e-06, + "loss": 0.6979, + "step": 3505 + }, + { + "epoch": 1.2942407088358356, + "grad_norm": 0.8960261344909668, + "learning_rate": 4.470931524542851e-06, + "loss": 0.6632, + "step": 3506 + }, + { + "epoch": 1.2946098941668718, + "grad_norm": 0.8780227899551392, + "learning_rate": 4.470632270369687e-06, + "loss": 0.6521, + "step": 3507 + }, + { + "epoch": 1.294979079497908, + "grad_norm": 0.9066250324249268, + "learning_rate": 4.470332941608713e-06, + "loss": 0.6217, + "step": 3508 + }, + { + "epoch": 1.295348264828944, + "grad_norm": 0.8582140803337097, + "learning_rate": 4.470033538271259e-06, + "loss": 0.6529, + "step": 3509 + }, + { + "epoch": 1.2957174501599802, + "grad_norm": 0.898539662361145, + "learning_rate": 4.469734060368658e-06, + "loss": 0.6541, + "step": 3510 + }, + { + "epoch": 1.2960866354910165, + "grad_norm": 0.878725528717041, + "learning_rate": 4.469434507912243e-06, + "loss": 0.6333, + "step": 3511 + }, + { + "epoch": 1.2964558208220527, + "grad_norm": 0.8806296586990356, + "learning_rate": 4.469134880913355e-06, + "loss": 0.6494, + "step": 3512 + }, + { + "epoch": 1.2968250061530888, + "grad_norm": 0.8821362853050232, + "learning_rate": 4.4688351793833325e-06, + "loss": 0.6339, + "step": 3513 + }, + { + "epoch": 1.297194191484125, + "grad_norm": 0.8555250763893127, + "learning_rate": 4.46853540333352e-06, + "loss": 0.6885, + "step": 3514 + }, + { + "epoch": 1.297563376815161, + "grad_norm": 0.8665767908096313, + "learning_rate": 4.468235552775264e-06, + "loss": 0.6077, + "step": 3515 + }, + { + "epoch": 1.2979325621461975, + "grad_norm": 0.8498830199241638, + "learning_rate": 4.467935627719913e-06, + "loss": 0.6472, + "step": 3516 + }, + { + "epoch": 1.2983017474772336, + "grad_norm": 0.8488712906837463, + "learning_rate": 4.46763562817882e-06, + "loss": 0.6395, + "step": 3517 + }, + { + "epoch": 1.2986709328082697, + "grad_norm": 0.8722914457321167, + "learning_rate": 4.467335554163339e-06, + "loss": 0.6578, + "step": 3518 + }, + { + "epoch": 1.2990401181393059, + "grad_norm": 0.836914598941803, + "learning_rate": 4.467035405684829e-06, + "loss": 0.6398, + "step": 3519 + }, + { + "epoch": 1.299409303470342, + "grad_norm": 0.8517598509788513, + "learning_rate": 4.466735182754649e-06, + "loss": 0.7064, + "step": 3520 + }, + { + "epoch": 1.2997784888013784, + "grad_norm": 0.8806140422821045, + "learning_rate": 4.466434885384163e-06, + "loss": 0.6606, + "step": 3521 + }, + { + "epoch": 1.3001476741324145, + "grad_norm": 0.8740143179893494, + "learning_rate": 4.466134513584738e-06, + "loss": 0.6565, + "step": 3522 + }, + { + "epoch": 1.3005168594634506, + "grad_norm": 0.8923680186271667, + "learning_rate": 4.4658340673677416e-06, + "loss": 0.6844, + "step": 3523 + }, + { + "epoch": 1.3008860447944868, + "grad_norm": 0.8765956163406372, + "learning_rate": 4.465533546744546e-06, + "loss": 0.6815, + "step": 3524 + }, + { + "epoch": 1.301255230125523, + "grad_norm": 0.8716487884521484, + "learning_rate": 4.465232951726526e-06, + "loss": 0.7116, + "step": 3525 + }, + { + "epoch": 1.3016244154565593, + "grad_norm": 0.8648078441619873, + "learning_rate": 4.464932282325059e-06, + "loss": 0.7036, + "step": 3526 + }, + { + "epoch": 1.3019936007875954, + "grad_norm": 0.9018440842628479, + "learning_rate": 4.4646315385515254e-06, + "loss": 0.7332, + "step": 3527 + }, + { + "epoch": 1.3023627861186315, + "grad_norm": 0.897003173828125, + "learning_rate": 4.464330720417308e-06, + "loss": 0.6554, + "step": 3528 + }, + { + "epoch": 1.3027319714496677, + "grad_norm": 0.8863136768341064, + "learning_rate": 4.4640298279337935e-06, + "loss": 0.6494, + "step": 3529 + }, + { + "epoch": 1.3031011567807038, + "grad_norm": 0.8514355421066284, + "learning_rate": 4.463728861112369e-06, + "loss": 0.6647, + "step": 3530 + }, + { + "epoch": 1.3034703421117402, + "grad_norm": 0.862873911857605, + "learning_rate": 4.463427819964427e-06, + "loss": 0.6732, + "step": 3531 + }, + { + "epoch": 1.3038395274427763, + "grad_norm": 0.8828049898147583, + "learning_rate": 4.463126704501362e-06, + "loss": 0.7031, + "step": 3532 + }, + { + "epoch": 1.3042087127738125, + "grad_norm": 0.8639241456985474, + "learning_rate": 4.462825514734571e-06, + "loss": 0.6804, + "step": 3533 + }, + { + "epoch": 1.3045778981048486, + "grad_norm": 0.8756214380264282, + "learning_rate": 4.462524250675453e-06, + "loss": 0.6428, + "step": 3534 + }, + { + "epoch": 1.3049470834358847, + "grad_norm": 0.8646184206008911, + "learning_rate": 4.462222912335412e-06, + "loss": 0.6478, + "step": 3535 + }, + { + "epoch": 1.305316268766921, + "grad_norm": 0.8731788992881775, + "learning_rate": 4.461921499725853e-06, + "loss": 0.6626, + "step": 3536 + }, + { + "epoch": 1.3056854540979572, + "grad_norm": 0.9140742421150208, + "learning_rate": 4.461620012858184e-06, + "loss": 0.6437, + "step": 3537 + }, + { + "epoch": 1.3060546394289934, + "grad_norm": 0.9103720784187317, + "learning_rate": 4.461318451743817e-06, + "loss": 0.635, + "step": 3538 + }, + { + "epoch": 1.3064238247600295, + "grad_norm": 0.9145829081535339, + "learning_rate": 4.461016816394166e-06, + "loss": 0.6775, + "step": 3539 + }, + { + "epoch": 1.3067930100910656, + "grad_norm": 0.8595937490463257, + "learning_rate": 4.460715106820647e-06, + "loss": 0.6452, + "step": 3540 + }, + { + "epoch": 1.307162195422102, + "grad_norm": 0.9261997938156128, + "learning_rate": 4.46041332303468e-06, + "loss": 0.6609, + "step": 3541 + }, + { + "epoch": 1.3075313807531381, + "grad_norm": 0.8920140266418457, + "learning_rate": 4.460111465047687e-06, + "loss": 0.6742, + "step": 3542 + }, + { + "epoch": 1.3079005660841743, + "grad_norm": 0.8888839483261108, + "learning_rate": 4.459809532871094e-06, + "loss": 0.6665, + "step": 3543 + }, + { + "epoch": 1.3082697514152104, + "grad_norm": 0.8753849864006042, + "learning_rate": 4.45950752651633e-06, + "loss": 0.7154, + "step": 3544 + }, + { + "epoch": 1.3086389367462465, + "grad_norm": 0.8806453943252563, + "learning_rate": 4.459205445994823e-06, + "loss": 0.696, + "step": 3545 + }, + { + "epoch": 1.309008122077283, + "grad_norm": 0.8916463255882263, + "learning_rate": 4.458903291318008e-06, + "loss": 0.6659, + "step": 3546 + }, + { + "epoch": 1.309377307408319, + "grad_norm": 0.9062025547027588, + "learning_rate": 4.458601062497323e-06, + "loss": 0.6715, + "step": 3547 + }, + { + "epoch": 1.3097464927393552, + "grad_norm": 0.8969030380249023, + "learning_rate": 4.458298759544205e-06, + "loss": 0.6442, + "step": 3548 + }, + { + "epoch": 1.3101156780703913, + "grad_norm": 0.8767018914222717, + "learning_rate": 4.457996382470097e-06, + "loss": 0.631, + "step": 3549 + }, + { + "epoch": 1.3104848634014274, + "grad_norm": 0.8527156710624695, + "learning_rate": 4.457693931286444e-06, + "loss": 0.6619, + "step": 3550 + }, + { + "epoch": 1.3108540487324638, + "grad_norm": 0.8597617149353027, + "learning_rate": 4.4573914060046945e-06, + "loss": 0.6494, + "step": 3551 + }, + { + "epoch": 1.3112232340635, + "grad_norm": 0.8826417922973633, + "learning_rate": 4.457088806636296e-06, + "loss": 0.6525, + "step": 3552 + }, + { + "epoch": 1.311592419394536, + "grad_norm": 0.8741385340690613, + "learning_rate": 4.456786133192706e-06, + "loss": 0.6726, + "step": 3553 + }, + { + "epoch": 1.3119616047255722, + "grad_norm": 0.8730423450469971, + "learning_rate": 4.4564833856853765e-06, + "loss": 0.6927, + "step": 3554 + }, + { + "epoch": 1.3123307900566084, + "grad_norm": 0.9085260629653931, + "learning_rate": 4.456180564125769e-06, + "loss": 0.6787, + "step": 3555 + }, + { + "epoch": 1.3126999753876447, + "grad_norm": 0.9021422863006592, + "learning_rate": 4.455877668525346e-06, + "loss": 0.6785, + "step": 3556 + }, + { + "epoch": 1.3130691607186809, + "grad_norm": 0.9034458994865417, + "learning_rate": 4.4555746988955684e-06, + "loss": 0.6207, + "step": 3557 + }, + { + "epoch": 1.313438346049717, + "grad_norm": 0.8622426390647888, + "learning_rate": 4.455271655247907e-06, + "loss": 0.6824, + "step": 3558 + }, + { + "epoch": 1.3138075313807531, + "grad_norm": 0.9064953923225403, + "learning_rate": 4.4549685375938305e-06, + "loss": 0.6959, + "step": 3559 + }, + { + "epoch": 1.3141767167117893, + "grad_norm": 0.9044845700263977, + "learning_rate": 4.454665345944811e-06, + "loss": 0.6973, + "step": 3560 + }, + { + "epoch": 1.3145459020428256, + "grad_norm": 0.9149091839790344, + "learning_rate": 4.454362080312325e-06, + "loss": 0.6872, + "step": 3561 + }, + { + "epoch": 1.3149150873738618, + "grad_norm": 0.9013177752494812, + "learning_rate": 4.454058740707852e-06, + "loss": 0.6776, + "step": 3562 + }, + { + "epoch": 1.315284272704898, + "grad_norm": 0.9099971055984497, + "learning_rate": 4.4537553271428715e-06, + "loss": 0.6328, + "step": 3563 + }, + { + "epoch": 1.315653458035934, + "grad_norm": 0.8542749285697937, + "learning_rate": 4.4534518396288696e-06, + "loss": 0.6286, + "step": 3564 + }, + { + "epoch": 1.3160226433669702, + "grad_norm": 0.856983482837677, + "learning_rate": 4.453148278177331e-06, + "loss": 0.6447, + "step": 3565 + }, + { + "epoch": 1.3163918286980065, + "grad_norm": 0.9067758917808533, + "learning_rate": 4.452844642799747e-06, + "loss": 0.667, + "step": 3566 + }, + { + "epoch": 1.3167610140290424, + "grad_norm": 0.9104489088058472, + "learning_rate": 4.452540933507609e-06, + "loss": 0.6457, + "step": 3567 + }, + { + "epoch": 1.3171301993600788, + "grad_norm": 0.8759661912918091, + "learning_rate": 4.452237150312414e-06, + "loss": 0.6897, + "step": 3568 + }, + { + "epoch": 1.317499384691115, + "grad_norm": 0.8820813298225403, + "learning_rate": 4.451933293225658e-06, + "loss": 0.6626, + "step": 3569 + }, + { + "epoch": 1.317868570022151, + "grad_norm": 0.8949965834617615, + "learning_rate": 4.451629362258843e-06, + "loss": 0.7055, + "step": 3570 + }, + { + "epoch": 1.3182377553531874, + "grad_norm": 0.8746169805526733, + "learning_rate": 4.451325357423474e-06, + "loss": 0.6289, + "step": 3571 + }, + { + "epoch": 1.3186069406842233, + "grad_norm": 0.8699045777320862, + "learning_rate": 4.451021278731054e-06, + "loss": 0.6743, + "step": 3572 + }, + { + "epoch": 1.3189761260152597, + "grad_norm": 0.8615305423736572, + "learning_rate": 4.450717126193096e-06, + "loss": 0.651, + "step": 3573 + }, + { + "epoch": 1.3193453113462958, + "grad_norm": 0.8749846816062927, + "learning_rate": 4.45041289982111e-06, + "loss": 0.6389, + "step": 3574 + }, + { + "epoch": 1.319714496677332, + "grad_norm": 0.8373249173164368, + "learning_rate": 4.45010859962661e-06, + "loss": 0.6673, + "step": 3575 + }, + { + "epoch": 1.3200836820083681, + "grad_norm": 0.8759661316871643, + "learning_rate": 4.449804225621116e-06, + "loss": 0.6547, + "step": 3576 + }, + { + "epoch": 1.3204528673394043, + "grad_norm": 0.8600711822509766, + "learning_rate": 4.4494997778161474e-06, + "loss": 0.6696, + "step": 3577 + }, + { + "epoch": 1.3208220526704406, + "grad_norm": 0.8885278105735779, + "learning_rate": 4.449195256223227e-06, + "loss": 0.6538, + "step": 3578 + }, + { + "epoch": 1.3211912380014768, + "grad_norm": 0.8988562226295471, + "learning_rate": 4.448890660853883e-06, + "loss": 0.6075, + "step": 3579 + }, + { + "epoch": 1.321560423332513, + "grad_norm": 0.8475468754768372, + "learning_rate": 4.4485859917196415e-06, + "loss": 0.6794, + "step": 3580 + }, + { + "epoch": 1.321929608663549, + "grad_norm": 0.8829089999198914, + "learning_rate": 4.4482812488320345e-06, + "loss": 0.652, + "step": 3581 + }, + { + "epoch": 1.3222987939945852, + "grad_norm": 0.8843732476234436, + "learning_rate": 4.447976432202597e-06, + "loss": 0.6778, + "step": 3582 + }, + { + "epoch": 1.3226679793256215, + "grad_norm": 0.917571485042572, + "learning_rate": 4.447671541842869e-06, + "loss": 0.631, + "step": 3583 + }, + { + "epoch": 1.3230371646566577, + "grad_norm": 0.839386522769928, + "learning_rate": 4.447366577764385e-06, + "loss": 0.631, + "step": 3584 + }, + { + "epoch": 1.3234063499876938, + "grad_norm": 0.8923275470733643, + "learning_rate": 4.447061539978691e-06, + "loss": 0.6615, + "step": 3585 + }, + { + "epoch": 1.32377553531873, + "grad_norm": 0.8693921566009521, + "learning_rate": 4.446756428497333e-06, + "loss": 0.6659, + "step": 3586 + }, + { + "epoch": 1.324144720649766, + "grad_norm": 0.9069177508354187, + "learning_rate": 4.446451243331858e-06, + "loss": 0.6528, + "step": 3587 + }, + { + "epoch": 1.3245139059808024, + "grad_norm": 0.8909980654716492, + "learning_rate": 4.446145984493818e-06, + "loss": 0.6552, + "step": 3588 + }, + { + "epoch": 1.3248830913118386, + "grad_norm": 0.848107099533081, + "learning_rate": 4.445840651994767e-06, + "loss": 0.6644, + "step": 3589 + }, + { + "epoch": 1.3252522766428747, + "grad_norm": 0.8829138278961182, + "learning_rate": 4.445535245846261e-06, + "loss": 0.6908, + "step": 3590 + }, + { + "epoch": 1.3256214619739108, + "grad_norm": 0.8904527425765991, + "learning_rate": 4.445229766059861e-06, + "loss": 0.6577, + "step": 3591 + }, + { + "epoch": 1.325990647304947, + "grad_norm": 0.8691785931587219, + "learning_rate": 4.444924212647127e-06, + "loss": 0.687, + "step": 3592 + }, + { + "epoch": 1.3263598326359833, + "grad_norm": 0.8996774554252625, + "learning_rate": 4.4446185856196255e-06, + "loss": 0.6614, + "step": 3593 + }, + { + "epoch": 1.3267290179670195, + "grad_norm": 0.8735383749008179, + "learning_rate": 4.444312884988925e-06, + "loss": 0.6219, + "step": 3594 + }, + { + "epoch": 1.3270982032980556, + "grad_norm": 0.8730587959289551, + "learning_rate": 4.444007110766595e-06, + "loss": 0.7343, + "step": 3595 + }, + { + "epoch": 1.3274673886290917, + "grad_norm": 0.9165945649147034, + "learning_rate": 4.44370126296421e-06, + "loss": 0.6854, + "step": 3596 + }, + { + "epoch": 1.3278365739601279, + "grad_norm": 0.8799672722816467, + "learning_rate": 4.443395341593344e-06, + "loss": 0.6164, + "step": 3597 + }, + { + "epoch": 1.3282057592911642, + "grad_norm": 0.861463189125061, + "learning_rate": 4.443089346665579e-06, + "loss": 0.6943, + "step": 3598 + }, + { + "epoch": 1.3285749446222004, + "grad_norm": 0.9101953506469727, + "learning_rate": 4.442783278192495e-06, + "loss": 0.6911, + "step": 3599 + }, + { + "epoch": 1.3289441299532365, + "grad_norm": 0.8630183339118958, + "learning_rate": 4.4424771361856776e-06, + "loss": 0.6599, + "step": 3600 + }, + { + "epoch": 1.3293133152842727, + "grad_norm": 0.8793606162071228, + "learning_rate": 4.442170920656713e-06, + "loss": 0.6803, + "step": 3601 + }, + { + "epoch": 1.3296825006153088, + "grad_norm": 0.9272857904434204, + "learning_rate": 4.441864631617193e-06, + "loss": 0.655, + "step": 3602 + }, + { + "epoch": 1.3300516859463452, + "grad_norm": 0.8646690845489502, + "learning_rate": 4.441558269078708e-06, + "loss": 0.6743, + "step": 3603 + }, + { + "epoch": 1.3304208712773813, + "grad_norm": 0.8722865581512451, + "learning_rate": 4.441251833052857e-06, + "loss": 0.6295, + "step": 3604 + }, + { + "epoch": 1.3307900566084174, + "grad_norm": 0.8462452292442322, + "learning_rate": 4.440945323551236e-06, + "loss": 0.6446, + "step": 3605 + }, + { + "epoch": 1.3311592419394536, + "grad_norm": 0.8822388648986816, + "learning_rate": 4.440638740585447e-06, + "loss": 0.6604, + "step": 3606 + }, + { + "epoch": 1.3315284272704897, + "grad_norm": 0.871185839176178, + "learning_rate": 4.440332084167094e-06, + "loss": 0.7219, + "step": 3607 + }, + { + "epoch": 1.331897612601526, + "grad_norm": 0.8862301707267761, + "learning_rate": 4.440025354307784e-06, + "loss": 0.6615, + "step": 3608 + }, + { + "epoch": 1.3322667979325622, + "grad_norm": 0.871020495891571, + "learning_rate": 4.439718551019127e-06, + "loss": 0.6984, + "step": 3609 + }, + { + "epoch": 1.3326359832635983, + "grad_norm": 0.9101143479347229, + "learning_rate": 4.439411674312735e-06, + "loss": 0.675, + "step": 3610 + }, + { + "epoch": 1.3330051685946345, + "grad_norm": 0.8410690426826477, + "learning_rate": 4.439104724200223e-06, + "loss": 0.6346, + "step": 3611 + }, + { + "epoch": 1.3333743539256706, + "grad_norm": 0.8770188093185425, + "learning_rate": 4.438797700693209e-06, + "loss": 0.6665, + "step": 3612 + }, + { + "epoch": 1.333743539256707, + "grad_norm": 0.8892994523048401, + "learning_rate": 4.438490603803314e-06, + "loss": 0.6772, + "step": 3613 + }, + { + "epoch": 1.334112724587743, + "grad_norm": 0.911948561668396, + "learning_rate": 4.438183433542162e-06, + "loss": 0.6704, + "step": 3614 + }, + { + "epoch": 1.3344819099187792, + "grad_norm": 0.9107742309570312, + "learning_rate": 4.437876189921377e-06, + "loss": 0.7037, + "step": 3615 + }, + { + "epoch": 1.3348510952498154, + "grad_norm": 0.9249632358551025, + "learning_rate": 4.437568872952592e-06, + "loss": 0.6376, + "step": 3616 + }, + { + "epoch": 1.3352202805808515, + "grad_norm": 0.8617792725563049, + "learning_rate": 4.437261482647436e-06, + "loss": 0.6745, + "step": 3617 + }, + { + "epoch": 1.3355894659118879, + "grad_norm": 0.8654313683509827, + "learning_rate": 4.436954019017544e-06, + "loss": 0.6623, + "step": 3618 + }, + { + "epoch": 1.335958651242924, + "grad_norm": 0.868438720703125, + "learning_rate": 4.436646482074554e-06, + "loss": 0.6621, + "step": 3619 + }, + { + "epoch": 1.3363278365739601, + "grad_norm": 0.8829731941223145, + "learning_rate": 4.4363388718301044e-06, + "loss": 0.7247, + "step": 3620 + }, + { + "epoch": 1.3366970219049963, + "grad_norm": 0.8960434198379517, + "learning_rate": 4.436031188295842e-06, + "loss": 0.6264, + "step": 3621 + }, + { + "epoch": 1.3370662072360324, + "grad_norm": 0.8759167790412903, + "learning_rate": 4.435723431483408e-06, + "loss": 0.6802, + "step": 3622 + }, + { + "epoch": 1.3374353925670688, + "grad_norm": 0.8516462445259094, + "learning_rate": 4.435415601404455e-06, + "loss": 0.6339, + "step": 3623 + }, + { + "epoch": 1.337804577898105, + "grad_norm": 0.886197030544281, + "learning_rate": 4.4351076980706306e-06, + "loss": 0.6806, + "step": 3624 + }, + { + "epoch": 1.338173763229141, + "grad_norm": 0.8860033750534058, + "learning_rate": 4.434799721493592e-06, + "loss": 0.6624, + "step": 3625 + }, + { + "epoch": 1.3385429485601772, + "grad_norm": 0.8891668319702148, + "learning_rate": 4.4344916716849935e-06, + "loss": 0.6682, + "step": 3626 + }, + { + "epoch": 1.3389121338912133, + "grad_norm": 0.8442513346672058, + "learning_rate": 4.434183548656496e-06, + "loss": 0.675, + "step": 3627 + }, + { + "epoch": 1.3392813192222497, + "grad_norm": 0.9233469367027283, + "learning_rate": 4.433875352419762e-06, + "loss": 0.6864, + "step": 3628 + }, + { + "epoch": 1.3396505045532858, + "grad_norm": 0.9088729619979858, + "learning_rate": 4.433567082986456e-06, + "loss": 0.6907, + "step": 3629 + }, + { + "epoch": 1.340019689884322, + "grad_norm": 0.8553927540779114, + "learning_rate": 4.433258740368246e-06, + "loss": 0.6607, + "step": 3630 + }, + { + "epoch": 1.340388875215358, + "grad_norm": 0.84653639793396, + "learning_rate": 4.432950324576803e-06, + "loss": 0.6351, + "step": 3631 + }, + { + "epoch": 1.3407580605463942, + "grad_norm": 0.866902768611908, + "learning_rate": 4.432641835623801e-06, + "loss": 0.6462, + "step": 3632 + }, + { + "epoch": 1.3411272458774306, + "grad_norm": 0.9056516885757446, + "learning_rate": 4.432333273520914e-06, + "loss": 0.669, + "step": 3633 + }, + { + "epoch": 1.3414964312084667, + "grad_norm": 0.8655253648757935, + "learning_rate": 4.432024638279823e-06, + "loss": 0.6865, + "step": 3634 + }, + { + "epoch": 1.3418656165395029, + "grad_norm": 0.851672887802124, + "learning_rate": 4.4317159299122095e-06, + "loss": 0.6688, + "step": 3635 + }, + { + "epoch": 1.342234801870539, + "grad_norm": 0.8888675570487976, + "learning_rate": 4.431407148429758e-06, + "loss": 0.6871, + "step": 3636 + }, + { + "epoch": 1.3426039872015751, + "grad_norm": 0.8496171832084656, + "learning_rate": 4.431098293844155e-06, + "loss": 0.6548, + "step": 3637 + }, + { + "epoch": 1.3429731725326115, + "grad_norm": 0.8836626410484314, + "learning_rate": 4.4307893661670906e-06, + "loss": 0.6418, + "step": 3638 + }, + { + "epoch": 1.3433423578636474, + "grad_norm": 0.8799203634262085, + "learning_rate": 4.430480365410259e-06, + "loss": 0.634, + "step": 3639 + }, + { + "epoch": 1.3437115431946838, + "grad_norm": 0.8665536642074585, + "learning_rate": 4.4301712915853545e-06, + "loss": 0.6476, + "step": 3640 + }, + { + "epoch": 1.34408072852572, + "grad_norm": 0.8701696395874023, + "learning_rate": 4.429862144704075e-06, + "loss": 0.6704, + "step": 3641 + }, + { + "epoch": 1.344449913856756, + "grad_norm": 0.8727666139602661, + "learning_rate": 4.429552924778124e-06, + "loss": 0.6939, + "step": 3642 + }, + { + "epoch": 1.3448190991877922, + "grad_norm": 0.8730077147483826, + "learning_rate": 4.429243631819202e-06, + "loss": 0.6502, + "step": 3643 + }, + { + "epoch": 1.3451882845188283, + "grad_norm": 0.8855935335159302, + "learning_rate": 4.4289342658390185e-06, + "loss": 0.6803, + "step": 3644 + }, + { + "epoch": 1.3455574698498647, + "grad_norm": 0.887362539768219, + "learning_rate": 4.428624826849281e-06, + "loss": 0.6665, + "step": 3645 + }, + { + "epoch": 1.3459266551809008, + "grad_norm": 0.8773642182350159, + "learning_rate": 4.428315314861704e-06, + "loss": 0.6932, + "step": 3646 + }, + { + "epoch": 1.346295840511937, + "grad_norm": 0.9010303616523743, + "learning_rate": 4.428005729888e-06, + "loss": 0.6686, + "step": 3647 + }, + { + "epoch": 1.346665025842973, + "grad_norm": 0.8826807737350464, + "learning_rate": 4.427696071939888e-06, + "loss": 0.6777, + "step": 3648 + }, + { + "epoch": 1.3470342111740092, + "grad_norm": 0.8741413950920105, + "learning_rate": 4.427386341029088e-06, + "loss": 0.672, + "step": 3649 + }, + { + "epoch": 1.3474033965050456, + "grad_norm": 0.858264684677124, + "learning_rate": 4.427076537167322e-06, + "loss": 0.6234, + "step": 3650 + }, + { + "epoch": 1.3477725818360817, + "grad_norm": 0.8989274501800537, + "learning_rate": 4.426766660366318e-06, + "loss": 0.6908, + "step": 3651 + }, + { + "epoch": 1.3481417671671179, + "grad_norm": 0.8839186429977417, + "learning_rate": 4.4264567106378046e-06, + "loss": 0.7072, + "step": 3652 + }, + { + "epoch": 1.348510952498154, + "grad_norm": 0.898168683052063, + "learning_rate": 4.426146687993512e-06, + "loss": 0.6946, + "step": 3653 + }, + { + "epoch": 1.3488801378291901, + "grad_norm": 0.9054598212242126, + "learning_rate": 4.425836592445177e-06, + "loss": 0.6696, + "step": 3654 + }, + { + "epoch": 1.3492493231602265, + "grad_norm": 0.8900846838951111, + "learning_rate": 4.425526424004533e-06, + "loss": 0.6787, + "step": 3655 + }, + { + "epoch": 1.3496185084912626, + "grad_norm": 0.8679911494255066, + "learning_rate": 4.425216182683321e-06, + "loss": 0.687, + "step": 3656 + }, + { + "epoch": 1.3499876938222988, + "grad_norm": 0.8488673567771912, + "learning_rate": 4.424905868493285e-06, + "loss": 0.6933, + "step": 3657 + }, + { + "epoch": 1.350356879153335, + "grad_norm": 0.8825292587280273, + "learning_rate": 4.4245954814461695e-06, + "loss": 0.6351, + "step": 3658 + }, + { + "epoch": 1.350726064484371, + "grad_norm": 0.9108297228813171, + "learning_rate": 4.424285021553722e-06, + "loss": 0.6835, + "step": 3659 + }, + { + "epoch": 1.3510952498154074, + "grad_norm": 0.8353254199028015, + "learning_rate": 4.4239744888276945e-06, + "loss": 0.6821, + "step": 3660 + }, + { + "epoch": 1.3514644351464435, + "grad_norm": 0.8671820163726807, + "learning_rate": 4.423663883279839e-06, + "loss": 0.6356, + "step": 3661 + }, + { + "epoch": 1.3518336204774797, + "grad_norm": 0.8462734222412109, + "learning_rate": 4.423353204921913e-06, + "loss": 0.6858, + "step": 3662 + }, + { + "epoch": 1.3522028058085158, + "grad_norm": 0.8816967010498047, + "learning_rate": 4.423042453765675e-06, + "loss": 0.6382, + "step": 3663 + }, + { + "epoch": 1.352571991139552, + "grad_norm": 0.8489300608634949, + "learning_rate": 4.422731629822887e-06, + "loss": 0.6922, + "step": 3664 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.8671981692314148, + "learning_rate": 4.422420733105314e-06, + "loss": 0.6283, + "step": 3665 + }, + { + "epoch": 1.3533103618016245, + "grad_norm": 0.8711367249488831, + "learning_rate": 4.4221097636247236e-06, + "loss": 0.6816, + "step": 3666 + }, + { + "epoch": 1.3536795471326606, + "grad_norm": 0.8688810467720032, + "learning_rate": 4.421798721392884e-06, + "loss": 0.6345, + "step": 3667 + }, + { + "epoch": 1.3540487324636967, + "grad_norm": 0.8804559707641602, + "learning_rate": 4.421487606421569e-06, + "loss": 0.6637, + "step": 3668 + }, + { + "epoch": 1.3544179177947329, + "grad_norm": 0.8601940870285034, + "learning_rate": 4.421176418722556e-06, + "loss": 0.6907, + "step": 3669 + }, + { + "epoch": 1.3547871031257692, + "grad_norm": 0.876093327999115, + "learning_rate": 4.4208651583076215e-06, + "loss": 0.6716, + "step": 3670 + }, + { + "epoch": 1.3551562884568054, + "grad_norm": 0.8793753981590271, + "learning_rate": 4.420553825188546e-06, + "loss": 0.6462, + "step": 3671 + }, + { + "epoch": 1.3555254737878415, + "grad_norm": 0.8810238838195801, + "learning_rate": 4.4202424193771155e-06, + "loss": 0.6346, + "step": 3672 + }, + { + "epoch": 1.3558946591188776, + "grad_norm": 0.8736618161201477, + "learning_rate": 4.419930940885115e-06, + "loss": 0.6914, + "step": 3673 + }, + { + "epoch": 1.3562638444499138, + "grad_norm": 0.888463020324707, + "learning_rate": 4.419619389724334e-06, + "loss": 0.7089, + "step": 3674 + }, + { + "epoch": 1.3566330297809501, + "grad_norm": 0.8611968159675598, + "learning_rate": 4.419307765906566e-06, + "loss": 0.6586, + "step": 3675 + }, + { + "epoch": 1.3570022151119863, + "grad_norm": 0.8408264517784119, + "learning_rate": 4.418996069443604e-06, + "loss": 0.6875, + "step": 3676 + }, + { + "epoch": 1.3573714004430224, + "grad_norm": 0.8582924008369446, + "learning_rate": 4.418684300347247e-06, + "loss": 0.6319, + "step": 3677 + }, + { + "epoch": 1.3577405857740585, + "grad_norm": 0.9110267758369446, + "learning_rate": 4.418372458629294e-06, + "loss": 0.6719, + "step": 3678 + }, + { + "epoch": 1.3581097711050947, + "grad_norm": 0.8936803936958313, + "learning_rate": 4.418060544301549e-06, + "loss": 0.6612, + "step": 3679 + }, + { + "epoch": 1.358478956436131, + "grad_norm": 0.8893243074417114, + "learning_rate": 4.417748557375818e-06, + "loss": 0.6577, + "step": 3680 + }, + { + "epoch": 1.3588481417671672, + "grad_norm": 0.8780907392501831, + "learning_rate": 4.417436497863909e-06, + "loss": 0.6814, + "step": 3681 + }, + { + "epoch": 1.3592173270982033, + "grad_norm": 0.8801484107971191, + "learning_rate": 4.4171243657776345e-06, + "loss": 0.6586, + "step": 3682 + }, + { + "epoch": 1.3595865124292394, + "grad_norm": 0.9020469188690186, + "learning_rate": 4.416812161128808e-06, + "loss": 0.6576, + "step": 3683 + }, + { + "epoch": 1.3599556977602756, + "grad_norm": 0.8510473370552063, + "learning_rate": 4.416499883929245e-06, + "loss": 0.6934, + "step": 3684 + }, + { + "epoch": 1.360324883091312, + "grad_norm": 0.861465334892273, + "learning_rate": 4.416187534190767e-06, + "loss": 0.6628, + "step": 3685 + }, + { + "epoch": 1.360694068422348, + "grad_norm": 0.9427391886711121, + "learning_rate": 4.415875111925195e-06, + "loss": 0.6865, + "step": 3686 + }, + { + "epoch": 1.3610632537533842, + "grad_norm": 0.8742334842681885, + "learning_rate": 4.415562617144355e-06, + "loss": 0.6598, + "step": 3687 + }, + { + "epoch": 1.3614324390844204, + "grad_norm": 0.8633026480674744, + "learning_rate": 4.415250049860074e-06, + "loss": 0.6305, + "step": 3688 + }, + { + "epoch": 1.3618016244154565, + "grad_norm": 0.8614804148674011, + "learning_rate": 4.414937410084184e-06, + "loss": 0.649, + "step": 3689 + }, + { + "epoch": 1.3621708097464928, + "grad_norm": 0.8729840517044067, + "learning_rate": 4.414624697828517e-06, + "loss": 0.6478, + "step": 3690 + }, + { + "epoch": 1.362539995077529, + "grad_norm": 0.8832775950431824, + "learning_rate": 4.41431191310491e-06, + "loss": 0.7047, + "step": 3691 + }, + { + "epoch": 1.3629091804085651, + "grad_norm": 0.9133788347244263, + "learning_rate": 4.4139990559252e-06, + "loss": 0.6781, + "step": 3692 + }, + { + "epoch": 1.3632783657396013, + "grad_norm": 0.911825954914093, + "learning_rate": 4.41368612630123e-06, + "loss": 0.6881, + "step": 3693 + }, + { + "epoch": 1.3636475510706374, + "grad_norm": 0.8695847392082214, + "learning_rate": 4.4133731242448445e-06, + "loss": 0.6216, + "step": 3694 + }, + { + "epoch": 1.3640167364016738, + "grad_norm": 0.8506943583488464, + "learning_rate": 4.41306004976789e-06, + "loss": 0.676, + "step": 3695 + }, + { + "epoch": 1.36438592173271, + "grad_norm": 0.8750126361846924, + "learning_rate": 4.412746902882216e-06, + "loss": 0.6677, + "step": 3696 + }, + { + "epoch": 1.364755107063746, + "grad_norm": 0.8811710476875305, + "learning_rate": 4.412433683599677e-06, + "loss": 0.6251, + "step": 3697 + }, + { + "epoch": 1.3651242923947822, + "grad_norm": 0.8517153263092041, + "learning_rate": 4.4121203919321255e-06, + "loss": 0.6412, + "step": 3698 + }, + { + "epoch": 1.3654934777258183, + "grad_norm": 0.8589240312576294, + "learning_rate": 4.41180702789142e-06, + "loss": 0.6708, + "step": 3699 + }, + { + "epoch": 1.3658626630568547, + "grad_norm": 0.875397801399231, + "learning_rate": 4.4114935914894234e-06, + "loss": 0.6765, + "step": 3700 + }, + { + "epoch": 1.3662318483878908, + "grad_norm": 0.8999257683753967, + "learning_rate": 4.411180082737997e-06, + "loss": 0.6551, + "step": 3701 + }, + { + "epoch": 1.366601033718927, + "grad_norm": 0.8827310800552368, + "learning_rate": 4.410866501649007e-06, + "loss": 0.6554, + "step": 3702 + }, + { + "epoch": 1.366970219049963, + "grad_norm": 0.8513714075088501, + "learning_rate": 4.410552848234324e-06, + "loss": 0.6648, + "step": 3703 + }, + { + "epoch": 1.3673394043809992, + "grad_norm": 0.887485146522522, + "learning_rate": 4.410239122505818e-06, + "loss": 0.6815, + "step": 3704 + }, + { + "epoch": 1.3677085897120356, + "grad_norm": 0.9034040570259094, + "learning_rate": 4.409925324475364e-06, + "loss": 0.6747, + "step": 3705 + }, + { + "epoch": 1.3680777750430715, + "grad_norm": 0.8613736629486084, + "learning_rate": 4.40961145415484e-06, + "loss": 0.6462, + "step": 3706 + }, + { + "epoch": 1.3684469603741078, + "grad_norm": 0.8886554837226868, + "learning_rate": 4.409297511556124e-06, + "loss": 0.6954, + "step": 3707 + }, + { + "epoch": 1.368816145705144, + "grad_norm": 0.9355406165122986, + "learning_rate": 4.408983496691101e-06, + "loss": 0.6164, + "step": 3708 + }, + { + "epoch": 1.3691853310361801, + "grad_norm": 0.8391455411911011, + "learning_rate": 4.408669409571654e-06, + "loss": 0.6395, + "step": 3709 + }, + { + "epoch": 1.3695545163672165, + "grad_norm": 0.8887644410133362, + "learning_rate": 4.408355250209673e-06, + "loss": 0.6352, + "step": 3710 + }, + { + "epoch": 1.3699237016982524, + "grad_norm": 0.8630105257034302, + "learning_rate": 4.408041018617048e-06, + "loss": 0.6537, + "step": 3711 + }, + { + "epoch": 1.3702928870292888, + "grad_norm": 0.9227706789970398, + "learning_rate": 4.407726714805673e-06, + "loss": 0.6814, + "step": 3712 + }, + { + "epoch": 1.370662072360325, + "grad_norm": 0.929074227809906, + "learning_rate": 4.4074123387874425e-06, + "loss": 0.6753, + "step": 3713 + }, + { + "epoch": 1.371031257691361, + "grad_norm": 0.8738381862640381, + "learning_rate": 4.407097890574258e-06, + "loss": 0.6406, + "step": 3714 + }, + { + "epoch": 1.3714004430223972, + "grad_norm": 0.8701595664024353, + "learning_rate": 4.40678337017802e-06, + "loss": 0.6695, + "step": 3715 + }, + { + "epoch": 1.3717696283534333, + "grad_norm": 0.868579089641571, + "learning_rate": 4.406468777610634e-06, + "loss": 0.6619, + "step": 3716 + }, + { + "epoch": 1.3721388136844697, + "grad_norm": 0.8897202610969543, + "learning_rate": 4.406154112884006e-06, + "loss": 0.6692, + "step": 3717 + }, + { + "epoch": 1.3725079990155058, + "grad_norm": 0.869539201259613, + "learning_rate": 4.405839376010046e-06, + "loss": 0.666, + "step": 3718 + }, + { + "epoch": 1.372877184346542, + "grad_norm": 0.8648926019668579, + "learning_rate": 4.405524567000667e-06, + "loss": 0.6646, + "step": 3719 + }, + { + "epoch": 1.373246369677578, + "grad_norm": 0.8612492084503174, + "learning_rate": 4.405209685867785e-06, + "loss": 0.6736, + "step": 3720 + }, + { + "epoch": 1.3736155550086142, + "grad_norm": 0.9043440818786621, + "learning_rate": 4.404894732623317e-06, + "loss": 0.663, + "step": 3721 + }, + { + "epoch": 1.3739847403396506, + "grad_norm": 0.8529611825942993, + "learning_rate": 4.404579707279185e-06, + "loss": 0.6693, + "step": 3722 + }, + { + "epoch": 1.3743539256706867, + "grad_norm": 0.8777966499328613, + "learning_rate": 4.404264609847313e-06, + "loss": 0.6558, + "step": 3723 + }, + { + "epoch": 1.3747231110017228, + "grad_norm": 0.8846269249916077, + "learning_rate": 4.4039494403396256e-06, + "loss": 0.6792, + "step": 3724 + }, + { + "epoch": 1.375092296332759, + "grad_norm": 0.8953638076782227, + "learning_rate": 4.403634198768053e-06, + "loss": 0.6801, + "step": 3725 + }, + { + "epoch": 1.3754614816637951, + "grad_norm": 0.8704994916915894, + "learning_rate": 4.403318885144527e-06, + "loss": 0.6922, + "step": 3726 + }, + { + "epoch": 1.3758306669948315, + "grad_norm": 0.9043406248092651, + "learning_rate": 4.403003499480981e-06, + "loss": 0.6947, + "step": 3727 + }, + { + "epoch": 1.3761998523258676, + "grad_norm": 0.8930913805961609, + "learning_rate": 4.402688041789354e-06, + "loss": 0.6609, + "step": 3728 + }, + { + "epoch": 1.3765690376569037, + "grad_norm": 0.842339277267456, + "learning_rate": 4.402372512081585e-06, + "loss": 0.682, + "step": 3729 + }, + { + "epoch": 1.3769382229879399, + "grad_norm": 0.8825166821479797, + "learning_rate": 4.402056910369617e-06, + "loss": 0.6848, + "step": 3730 + }, + { + "epoch": 1.377307408318976, + "grad_norm": 0.9247344732284546, + "learning_rate": 4.401741236665394e-06, + "loss": 0.655, + "step": 3731 + }, + { + "epoch": 1.3776765936500124, + "grad_norm": 0.8742569088935852, + "learning_rate": 4.401425490980867e-06, + "loss": 0.6714, + "step": 3732 + }, + { + "epoch": 1.3780457789810485, + "grad_norm": 0.879940927028656, + "learning_rate": 4.401109673327985e-06, + "loss": 0.6581, + "step": 3733 + }, + { + "epoch": 1.3784149643120847, + "grad_norm": 0.8889074921607971, + "learning_rate": 4.400793783718701e-06, + "loss": 0.6582, + "step": 3734 + }, + { + "epoch": 1.3787841496431208, + "grad_norm": 0.8838909864425659, + "learning_rate": 4.400477822164972e-06, + "loss": 0.6731, + "step": 3735 + }, + { + "epoch": 1.379153334974157, + "grad_norm": 0.8670485019683838, + "learning_rate": 4.4001617886787585e-06, + "loss": 0.6692, + "step": 3736 + }, + { + "epoch": 1.3795225203051933, + "grad_norm": 0.8991731405258179, + "learning_rate": 4.399845683272021e-06, + "loss": 0.6482, + "step": 3737 + }, + { + "epoch": 1.3798917056362294, + "grad_norm": 0.8649955987930298, + "learning_rate": 4.3995295059567235e-06, + "loss": 0.664, + "step": 3738 + }, + { + "epoch": 1.3802608909672656, + "grad_norm": 0.8813152313232422, + "learning_rate": 4.399213256744833e-06, + "loss": 0.6792, + "step": 3739 + }, + { + "epoch": 1.3806300762983017, + "grad_norm": 0.8895173668861389, + "learning_rate": 4.398896935648321e-06, + "loss": 0.7113, + "step": 3740 + }, + { + "epoch": 1.3809992616293378, + "grad_norm": 0.8955971002578735, + "learning_rate": 4.398580542679159e-06, + "loss": 0.6698, + "step": 3741 + }, + { + "epoch": 1.3813684469603742, + "grad_norm": 0.8534330725669861, + "learning_rate": 4.398264077849322e-06, + "loss": 0.6842, + "step": 3742 + }, + { + "epoch": 1.3817376322914103, + "grad_norm": 0.8854157328605652, + "learning_rate": 4.397947541170789e-06, + "loss": 0.6808, + "step": 3743 + }, + { + "epoch": 1.3821068176224465, + "grad_norm": 0.8862353563308716, + "learning_rate": 4.397630932655542e-06, + "loss": 0.6899, + "step": 3744 + }, + { + "epoch": 1.3824760029534826, + "grad_norm": 0.8985684514045715, + "learning_rate": 4.397314252315562e-06, + "loss": 0.6345, + "step": 3745 + }, + { + "epoch": 1.3828451882845187, + "grad_norm": 0.8654945492744446, + "learning_rate": 4.396997500162836e-06, + "loss": 0.6478, + "step": 3746 + }, + { + "epoch": 1.383214373615555, + "grad_norm": 0.8555692434310913, + "learning_rate": 4.396680676209354e-06, + "loss": 0.667, + "step": 3747 + }, + { + "epoch": 1.3835835589465912, + "grad_norm": 0.9078381657600403, + "learning_rate": 4.396363780467107e-06, + "loss": 0.6558, + "step": 3748 + }, + { + "epoch": 1.3839527442776274, + "grad_norm": 0.8400607109069824, + "learning_rate": 4.396046812948089e-06, + "loss": 0.6374, + "step": 3749 + }, + { + "epoch": 1.3843219296086635, + "grad_norm": 0.853996753692627, + "learning_rate": 4.395729773664298e-06, + "loss": 0.6766, + "step": 3750 + }, + { + "epoch": 1.3846911149396997, + "grad_norm": 0.908796489238739, + "learning_rate": 4.395412662627733e-06, + "loss": 0.6315, + "step": 3751 + }, + { + "epoch": 1.385060300270736, + "grad_norm": 0.8804665803909302, + "learning_rate": 4.395095479850396e-06, + "loss": 0.6438, + "step": 3752 + }, + { + "epoch": 1.3854294856017721, + "grad_norm": 0.8495657444000244, + "learning_rate": 4.394778225344295e-06, + "loss": 0.6936, + "step": 3753 + }, + { + "epoch": 1.3857986709328083, + "grad_norm": 0.8822701573371887, + "learning_rate": 4.3944608991214355e-06, + "loss": 0.6653, + "step": 3754 + }, + { + "epoch": 1.3861678562638444, + "grad_norm": 0.8866490125656128, + "learning_rate": 4.394143501193828e-06, + "loss": 0.6938, + "step": 3755 + }, + { + "epoch": 1.3865370415948806, + "grad_norm": 0.9411451816558838, + "learning_rate": 4.393826031573488e-06, + "loss": 0.6705, + "step": 3756 + }, + { + "epoch": 1.386906226925917, + "grad_norm": 0.8738088607788086, + "learning_rate": 4.39350849027243e-06, + "loss": 0.6709, + "step": 3757 + }, + { + "epoch": 1.387275412256953, + "grad_norm": 0.8573096394538879, + "learning_rate": 4.393190877302673e-06, + "loss": 0.6737, + "step": 3758 + }, + { + "epoch": 1.3876445975879892, + "grad_norm": 0.8690365552902222, + "learning_rate": 4.392873192676239e-06, + "loss": 0.6412, + "step": 3759 + }, + { + "epoch": 1.3880137829190253, + "grad_norm": 0.8553135991096497, + "learning_rate": 4.392555436405152e-06, + "loss": 0.6358, + "step": 3760 + }, + { + "epoch": 1.3883829682500615, + "grad_norm": 0.8620362281799316, + "learning_rate": 4.392237608501439e-06, + "loss": 0.6766, + "step": 3761 + }, + { + "epoch": 1.3887521535810978, + "grad_norm": 0.8985918760299683, + "learning_rate": 4.39191970897713e-06, + "loss": 0.6581, + "step": 3762 + }, + { + "epoch": 1.389121338912134, + "grad_norm": 0.8957574963569641, + "learning_rate": 4.391601737844257e-06, + "loss": 0.6535, + "step": 3763 + }, + { + "epoch": 1.38949052424317, + "grad_norm": 0.8814603686332703, + "learning_rate": 4.391283695114855e-06, + "loss": 0.6335, + "step": 3764 + }, + { + "epoch": 1.3898597095742062, + "grad_norm": 0.8887036442756653, + "learning_rate": 4.390965580800962e-06, + "loss": 0.6473, + "step": 3765 + }, + { + "epoch": 1.3902288949052424, + "grad_norm": 0.8929142355918884, + "learning_rate": 4.390647394914618e-06, + "loss": 0.7101, + "step": 3766 + }, + { + "epoch": 1.3905980802362787, + "grad_norm": 0.9117453694343567, + "learning_rate": 4.390329137467868e-06, + "loss": 0.693, + "step": 3767 + }, + { + "epoch": 1.3909672655673149, + "grad_norm": 0.8983461260795593, + "learning_rate": 4.390010808472757e-06, + "loss": 0.6278, + "step": 3768 + }, + { + "epoch": 1.391336450898351, + "grad_norm": 0.8711938261985779, + "learning_rate": 4.389692407941332e-06, + "loss": 0.6606, + "step": 3769 + }, + { + "epoch": 1.3917056362293871, + "grad_norm": 0.8953997492790222, + "learning_rate": 4.3893739358856465e-06, + "loss": 0.6768, + "step": 3770 + }, + { + "epoch": 1.3920748215604233, + "grad_norm": 0.8780116438865662, + "learning_rate": 4.389055392317753e-06, + "loss": 0.6285, + "step": 3771 + }, + { + "epoch": 1.3924440068914596, + "grad_norm": 0.8701308965682983, + "learning_rate": 4.38873677724971e-06, + "loss": 0.6658, + "step": 3772 + }, + { + "epoch": 1.3928131922224956, + "grad_norm": 0.8776068687438965, + "learning_rate": 4.3884180906935755e-06, + "loss": 0.6434, + "step": 3773 + }, + { + "epoch": 1.393182377553532, + "grad_norm": 0.8818007707595825, + "learning_rate": 4.388099332661413e-06, + "loss": 0.6661, + "step": 3774 + }, + { + "epoch": 1.393551562884568, + "grad_norm": 0.8859515190124512, + "learning_rate": 4.3877805031652865e-06, + "loss": 0.6862, + "step": 3775 + }, + { + "epoch": 1.3939207482156042, + "grad_norm": 0.8969332575798035, + "learning_rate": 4.387461602217263e-06, + "loss": 0.6876, + "step": 3776 + }, + { + "epoch": 1.3942899335466405, + "grad_norm": 0.8645728230476379, + "learning_rate": 4.387142629829415e-06, + "loss": 0.6402, + "step": 3777 + }, + { + "epoch": 1.3946591188776765, + "grad_norm": 0.8761599063873291, + "learning_rate": 4.3868235860138124e-06, + "loss": 0.7227, + "step": 3778 + }, + { + "epoch": 1.3950283042087128, + "grad_norm": 0.8785859942436218, + "learning_rate": 4.386504470782534e-06, + "loss": 0.6745, + "step": 3779 + }, + { + "epoch": 1.395397489539749, + "grad_norm": 0.8668728470802307, + "learning_rate": 4.386185284147656e-06, + "loss": 0.668, + "step": 3780 + }, + { + "epoch": 1.395766674870785, + "grad_norm": 0.8456920981407166, + "learning_rate": 4.3858660261212596e-06, + "loss": 0.649, + "step": 3781 + }, + { + "epoch": 1.3961358602018212, + "grad_norm": 0.865300714969635, + "learning_rate": 4.38554669671543e-06, + "loss": 0.6441, + "step": 3782 + }, + { + "epoch": 1.3965050455328574, + "grad_norm": 0.8949540257453918, + "learning_rate": 4.385227295942253e-06, + "loss": 0.6572, + "step": 3783 + }, + { + "epoch": 1.3968742308638937, + "grad_norm": 0.891494870185852, + "learning_rate": 4.3849078238138185e-06, + "loss": 0.6599, + "step": 3784 + }, + { + "epoch": 1.3972434161949299, + "grad_norm": 0.8542261123657227, + "learning_rate": 4.384588280342217e-06, + "loss": 0.6666, + "step": 3785 + }, + { + "epoch": 1.397612601525966, + "grad_norm": 0.8925033807754517, + "learning_rate": 4.384268665539544e-06, + "loss": 0.6631, + "step": 3786 + }, + { + "epoch": 1.3979817868570021, + "grad_norm": 0.8842124938964844, + "learning_rate": 4.383948979417897e-06, + "loss": 0.6415, + "step": 3787 + }, + { + "epoch": 1.3983509721880383, + "grad_norm": 0.9059325456619263, + "learning_rate": 4.383629221989375e-06, + "loss": 0.6579, + "step": 3788 + }, + { + "epoch": 1.3987201575190746, + "grad_norm": 0.8651158213615417, + "learning_rate": 4.3833093932660815e-06, + "loss": 0.6732, + "step": 3789 + }, + { + "epoch": 1.3990893428501108, + "grad_norm": 0.9320131540298462, + "learning_rate": 4.382989493260122e-06, + "loss": 0.6708, + "step": 3790 + }, + { + "epoch": 1.399458528181147, + "grad_norm": 0.8735993504524231, + "learning_rate": 4.3826695219836054e-06, + "loss": 0.7188, + "step": 3791 + }, + { + "epoch": 1.399827713512183, + "grad_norm": 0.8637703657150269, + "learning_rate": 4.382349479448641e-06, + "loss": 0.669, + "step": 3792 + }, + { + "epoch": 1.4001968988432192, + "grad_norm": 0.8687921166419983, + "learning_rate": 4.382029365667343e-06, + "loss": 0.6888, + "step": 3793 + }, + { + "epoch": 1.4005660841742555, + "grad_norm": 0.9096945524215698, + "learning_rate": 4.381709180651827e-06, + "loss": 0.6698, + "step": 3794 + }, + { + "epoch": 1.4009352695052917, + "grad_norm": 0.8404715657234192, + "learning_rate": 4.381388924414213e-06, + "loss": 0.6785, + "step": 3795 + }, + { + "epoch": 1.4013044548363278, + "grad_norm": 0.863408088684082, + "learning_rate": 4.381068596966621e-06, + "loss": 0.6777, + "step": 3796 + }, + { + "epoch": 1.401673640167364, + "grad_norm": 0.8809120059013367, + "learning_rate": 4.380748198321177e-06, + "loss": 0.6374, + "step": 3797 + }, + { + "epoch": 1.4020428254984, + "grad_norm": 0.9038072228431702, + "learning_rate": 4.380427728490007e-06, + "loss": 0.6697, + "step": 3798 + }, + { + "epoch": 1.4024120108294365, + "grad_norm": 0.8752264976501465, + "learning_rate": 4.380107187485241e-06, + "loss": 0.6653, + "step": 3799 + }, + { + "epoch": 1.4027811961604726, + "grad_norm": 0.9201210737228394, + "learning_rate": 4.379786575319011e-06, + "loss": 0.6801, + "step": 3800 + }, + { + "epoch": 1.4031503814915087, + "grad_norm": 0.910210132598877, + "learning_rate": 4.379465892003453e-06, + "loss": 0.6625, + "step": 3801 + }, + { + "epoch": 1.4035195668225449, + "grad_norm": 0.9096465110778809, + "learning_rate": 4.379145137550704e-06, + "loss": 0.6612, + "step": 3802 + }, + { + "epoch": 1.403888752153581, + "grad_norm": 0.9016884565353394, + "learning_rate": 4.378824311972905e-06, + "loss": 0.6153, + "step": 3803 + }, + { + "epoch": 1.4042579374846174, + "grad_norm": 0.8370362520217896, + "learning_rate": 4.378503415282198e-06, + "loss": 0.6899, + "step": 3804 + }, + { + "epoch": 1.4046271228156535, + "grad_norm": 0.8975310325622559, + "learning_rate": 4.37818244749073e-06, + "loss": 0.6718, + "step": 3805 + }, + { + "epoch": 1.4049963081466896, + "grad_norm": 0.8600775599479675, + "learning_rate": 4.377861408610649e-06, + "loss": 0.6272, + "step": 3806 + }, + { + "epoch": 1.4053654934777258, + "grad_norm": 0.8517125844955444, + "learning_rate": 4.377540298654106e-06, + "loss": 0.6556, + "step": 3807 + }, + { + "epoch": 1.405734678808762, + "grad_norm": 0.8559024930000305, + "learning_rate": 4.377219117633256e-06, + "loss": 0.6159, + "step": 3808 + }, + { + "epoch": 1.4061038641397983, + "grad_norm": 0.8222905397415161, + "learning_rate": 4.3768978655602544e-06, + "loss": 0.6329, + "step": 3809 + }, + { + "epoch": 1.4064730494708344, + "grad_norm": 0.8658284544944763, + "learning_rate": 4.376576542447262e-06, + "loss": 0.6388, + "step": 3810 + }, + { + "epoch": 1.4068422348018705, + "grad_norm": 0.8767262697219849, + "learning_rate": 4.376255148306438e-06, + "loss": 0.6495, + "step": 3811 + }, + { + "epoch": 1.4072114201329067, + "grad_norm": 0.8655863404273987, + "learning_rate": 4.37593368314995e-06, + "loss": 0.6727, + "step": 3812 + }, + { + "epoch": 1.4075806054639428, + "grad_norm": 0.8878896236419678, + "learning_rate": 4.375612146989964e-06, + "loss": 0.6315, + "step": 3813 + }, + { + "epoch": 1.4079497907949792, + "grad_norm": 0.8787865042686462, + "learning_rate": 4.375290539838649e-06, + "loss": 0.6944, + "step": 3814 + }, + { + "epoch": 1.4083189761260153, + "grad_norm": 0.8709694147109985, + "learning_rate": 4.37496886170818e-06, + "loss": 0.6472, + "step": 3815 + }, + { + "epoch": 1.4086881614570514, + "grad_norm": 0.8848674297332764, + "learning_rate": 4.374647112610731e-06, + "loss": 0.6463, + "step": 3816 + }, + { + "epoch": 1.4090573467880876, + "grad_norm": 0.8557237982749939, + "learning_rate": 4.3743252925584815e-06, + "loss": 0.707, + "step": 3817 + }, + { + "epoch": 1.4094265321191237, + "grad_norm": 0.8707484006881714, + "learning_rate": 4.37400340156361e-06, + "loss": 0.6616, + "step": 3818 + }, + { + "epoch": 1.40979571745016, + "grad_norm": 0.8850047588348389, + "learning_rate": 4.373681439638301e-06, + "loss": 0.6424, + "step": 3819 + }, + { + "epoch": 1.4101649027811962, + "grad_norm": 0.847669243812561, + "learning_rate": 4.373359406794741e-06, + "loss": 0.6747, + "step": 3820 + }, + { + "epoch": 1.4105340881122324, + "grad_norm": 0.8728204965591431, + "learning_rate": 4.3730373030451195e-06, + "loss": 0.7041, + "step": 3821 + }, + { + "epoch": 1.4109032734432685, + "grad_norm": 0.8460925221443176, + "learning_rate": 4.372715128401627e-06, + "loss": 0.6638, + "step": 3822 + }, + { + "epoch": 1.4112724587743046, + "grad_norm": 0.8931689858436584, + "learning_rate": 4.372392882876457e-06, + "loss": 0.6629, + "step": 3823 + }, + { + "epoch": 1.411641644105341, + "grad_norm": 0.8652756810188293, + "learning_rate": 4.372070566481809e-06, + "loss": 0.6296, + "step": 3824 + }, + { + "epoch": 1.4120108294363771, + "grad_norm": 0.8758707046508789, + "learning_rate": 4.37174817922988e-06, + "loss": 0.6609, + "step": 3825 + }, + { + "epoch": 1.4123800147674133, + "grad_norm": 0.9095095992088318, + "learning_rate": 4.371425721132872e-06, + "loss": 0.6524, + "step": 3826 + }, + { + "epoch": 1.4127492000984494, + "grad_norm": 0.8675584197044373, + "learning_rate": 4.371103192202993e-06, + "loss": 0.6331, + "step": 3827 + }, + { + "epoch": 1.4131183854294855, + "grad_norm": 0.8640998601913452, + "learning_rate": 4.370780592452447e-06, + "loss": 0.6521, + "step": 3828 + }, + { + "epoch": 1.413487570760522, + "grad_norm": 0.8786735534667969, + "learning_rate": 4.370457921893447e-06, + "loss": 0.6942, + "step": 3829 + }, + { + "epoch": 1.413856756091558, + "grad_norm": 0.8779584765434265, + "learning_rate": 4.370135180538206e-06, + "loss": 0.7027, + "step": 3830 + }, + { + "epoch": 1.4142259414225942, + "grad_norm": 0.8783898949623108, + "learning_rate": 4.3698123683989365e-06, + "loss": 0.6777, + "step": 3831 + }, + { + "epoch": 1.4145951267536303, + "grad_norm": 0.8926100730895996, + "learning_rate": 4.36948948548786e-06, + "loss": 0.6782, + "step": 3832 + }, + { + "epoch": 1.4149643120846664, + "grad_norm": 0.8497980833053589, + "learning_rate": 4.3691665318171965e-06, + "loss": 0.6559, + "step": 3833 + }, + { + "epoch": 1.4153334974157028, + "grad_norm": 0.8753690719604492, + "learning_rate": 4.36884350739917e-06, + "loss": 0.6504, + "step": 3834 + }, + { + "epoch": 1.415702682746739, + "grad_norm": 0.8620742559432983, + "learning_rate": 4.368520412246006e-06, + "loss": 0.686, + "step": 3835 + }, + { + "epoch": 1.416071868077775, + "grad_norm": 0.8733710050582886, + "learning_rate": 4.368197246369934e-06, + "loss": 0.6468, + "step": 3836 + }, + { + "epoch": 1.4164410534088112, + "grad_norm": 0.8658912777900696, + "learning_rate": 4.367874009783186e-06, + "loss": 0.6146, + "step": 3837 + }, + { + "epoch": 1.4168102387398473, + "grad_norm": 0.8583911657333374, + "learning_rate": 4.3675507024979965e-06, + "loss": 0.6509, + "step": 3838 + }, + { + "epoch": 1.4171794240708837, + "grad_norm": 0.8736206293106079, + "learning_rate": 4.367227324526603e-06, + "loss": 0.7105, + "step": 3839 + }, + { + "epoch": 1.4175486094019198, + "grad_norm": 0.8983967900276184, + "learning_rate": 4.366903875881243e-06, + "loss": 0.6848, + "step": 3840 + }, + { + "epoch": 1.417917794732956, + "grad_norm": 0.8642120361328125, + "learning_rate": 4.3665803565741614e-06, + "loss": 0.6602, + "step": 3841 + }, + { + "epoch": 1.4182869800639921, + "grad_norm": 0.890494167804718, + "learning_rate": 4.366256766617602e-06, + "loss": 0.6456, + "step": 3842 + }, + { + "epoch": 1.4186561653950283, + "grad_norm": 0.8886361122131348, + "learning_rate": 4.365933106023814e-06, + "loss": 0.7012, + "step": 3843 + }, + { + "epoch": 1.4190253507260646, + "grad_norm": 0.8562700748443604, + "learning_rate": 4.365609374805046e-06, + "loss": 0.6656, + "step": 3844 + }, + { + "epoch": 1.4193945360571005, + "grad_norm": 0.8894068598747253, + "learning_rate": 4.365285572973552e-06, + "loss": 0.6061, + "step": 3845 + }, + { + "epoch": 1.419763721388137, + "grad_norm": 0.8436412811279297, + "learning_rate": 4.364961700541588e-06, + "loss": 0.6117, + "step": 3846 + }, + { + "epoch": 1.420132906719173, + "grad_norm": 0.8923864960670471, + "learning_rate": 4.364637757521412e-06, + "loss": 0.688, + "step": 3847 + }, + { + "epoch": 1.4205020920502092, + "grad_norm": 0.9110100865364075, + "learning_rate": 4.3643137439252855e-06, + "loss": 0.6802, + "step": 3848 + }, + { + "epoch": 1.4208712773812455, + "grad_norm": 0.8883415460586548, + "learning_rate": 4.363989659765472e-06, + "loss": 0.6295, + "step": 3849 + }, + { + "epoch": 1.4212404627122814, + "grad_norm": 0.8657521605491638, + "learning_rate": 4.363665505054238e-06, + "loss": 0.624, + "step": 3850 + }, + { + "epoch": 1.4216096480433178, + "grad_norm": 0.8555691242218018, + "learning_rate": 4.3633412798038535e-06, + "loss": 0.6435, + "step": 3851 + }, + { + "epoch": 1.421978833374354, + "grad_norm": 0.830886960029602, + "learning_rate": 4.363016984026589e-06, + "loss": 0.6645, + "step": 3852 + }, + { + "epoch": 1.42234801870539, + "grad_norm": 0.867110550403595, + "learning_rate": 4.362692617734719e-06, + "loss": 0.6299, + "step": 3853 + }, + { + "epoch": 1.4227172040364262, + "grad_norm": 0.8683968186378479, + "learning_rate": 4.362368180940523e-06, + "loss": 0.6134, + "step": 3854 + }, + { + "epoch": 1.4230863893674623, + "grad_norm": 0.8312559723854065, + "learning_rate": 4.362043673656278e-06, + "loss": 0.6745, + "step": 3855 + }, + { + "epoch": 1.4234555746984987, + "grad_norm": 0.8871437907218933, + "learning_rate": 4.361719095894268e-06, + "loss": 0.655, + "step": 3856 + }, + { + "epoch": 1.4238247600295348, + "grad_norm": 0.8532933592796326, + "learning_rate": 4.361394447666778e-06, + "loss": 0.6517, + "step": 3857 + }, + { + "epoch": 1.424193945360571, + "grad_norm": 0.8708601593971252, + "learning_rate": 4.3610697289860945e-06, + "loss": 0.7104, + "step": 3858 + }, + { + "epoch": 1.4245631306916071, + "grad_norm": 0.8918480277061462, + "learning_rate": 4.36074493986451e-06, + "loss": 0.6493, + "step": 3859 + }, + { + "epoch": 1.4249323160226433, + "grad_norm": 0.851800262928009, + "learning_rate": 4.360420080314317e-06, + "loss": 0.6475, + "step": 3860 + }, + { + "epoch": 1.4253015013536796, + "grad_norm": 0.8531615138053894, + "learning_rate": 4.360095150347811e-06, + "loss": 0.6114, + "step": 3861 + }, + { + "epoch": 1.4256706866847157, + "grad_norm": 0.893435001373291, + "learning_rate": 4.359770149977291e-06, + "loss": 0.6722, + "step": 3862 + }, + { + "epoch": 1.4260398720157519, + "grad_norm": 0.8950505256652832, + "learning_rate": 4.3594450792150565e-06, + "loss": 0.6439, + "step": 3863 + }, + { + "epoch": 1.426409057346788, + "grad_norm": 0.8813532590866089, + "learning_rate": 4.359119938073413e-06, + "loss": 0.656, + "step": 3864 + }, + { + "epoch": 1.4267782426778242, + "grad_norm": 0.8569951057434082, + "learning_rate": 4.358794726564667e-06, + "loss": 0.648, + "step": 3865 + }, + { + "epoch": 1.4271474280088605, + "grad_norm": 0.8500365018844604, + "learning_rate": 4.358469444701127e-06, + "loss": 0.6473, + "step": 3866 + }, + { + "epoch": 1.4275166133398967, + "grad_norm": 0.8925997018814087, + "learning_rate": 4.358144092495106e-06, + "loss": 0.6533, + "step": 3867 + }, + { + "epoch": 1.4278857986709328, + "grad_norm": 0.8763107061386108, + "learning_rate": 4.357818669958916e-06, + "loss": 0.6532, + "step": 3868 + }, + { + "epoch": 1.428254984001969, + "grad_norm": 0.8858116865158081, + "learning_rate": 4.3574931771048766e-06, + "loss": 0.6621, + "step": 3869 + }, + { + "epoch": 1.428624169333005, + "grad_norm": 0.8684327006340027, + "learning_rate": 4.357167613945307e-06, + "loss": 0.68, + "step": 3870 + }, + { + "epoch": 1.4289933546640414, + "grad_norm": 0.8642259836196899, + "learning_rate": 4.356841980492528e-06, + "loss": 0.6706, + "step": 3871 + }, + { + "epoch": 1.4293625399950776, + "grad_norm": 0.8714734315872192, + "learning_rate": 4.356516276758868e-06, + "loss": 0.627, + "step": 3872 + }, + { + "epoch": 1.4297317253261137, + "grad_norm": 0.8607965707778931, + "learning_rate": 4.356190502756651e-06, + "loss": 0.6711, + "step": 3873 + }, + { + "epoch": 1.4301009106571498, + "grad_norm": 0.9022106528282166, + "learning_rate": 4.35586465849821e-06, + "loss": 0.6881, + "step": 3874 + }, + { + "epoch": 1.430470095988186, + "grad_norm": 0.8742704391479492, + "learning_rate": 4.3555387439958775e-06, + "loss": 0.6576, + "step": 3875 + }, + { + "epoch": 1.4308392813192223, + "grad_norm": 0.8614667654037476, + "learning_rate": 4.355212759261989e-06, + "loss": 0.6487, + "step": 3876 + }, + { + "epoch": 1.4312084666502585, + "grad_norm": 0.8625785708427429, + "learning_rate": 4.354886704308883e-06, + "loss": 0.6824, + "step": 3877 + }, + { + "epoch": 1.4315776519812946, + "grad_norm": 0.8671157360076904, + "learning_rate": 4.354560579148901e-06, + "loss": 0.642, + "step": 3878 + }, + { + "epoch": 1.4319468373123307, + "grad_norm": 0.884055495262146, + "learning_rate": 4.354234383794386e-06, + "loss": 0.6492, + "step": 3879 + }, + { + "epoch": 1.4323160226433669, + "grad_norm": 0.8620643019676208, + "learning_rate": 4.353908118257685e-06, + "loss": 0.6987, + "step": 3880 + }, + { + "epoch": 1.4326852079744032, + "grad_norm": 0.8969706892967224, + "learning_rate": 4.353581782551147e-06, + "loss": 0.6722, + "step": 3881 + }, + { + "epoch": 1.4330543933054394, + "grad_norm": 0.8733465671539307, + "learning_rate": 4.3532553766871235e-06, + "loss": 0.6674, + "step": 3882 + }, + { + "epoch": 1.4334235786364755, + "grad_norm": 0.8837853670120239, + "learning_rate": 4.352928900677968e-06, + "loss": 0.6681, + "step": 3883 + }, + { + "epoch": 1.4337927639675117, + "grad_norm": 0.8788067102432251, + "learning_rate": 4.35260235453604e-06, + "loss": 0.6375, + "step": 3884 + }, + { + "epoch": 1.4341619492985478, + "grad_norm": 0.8843108415603638, + "learning_rate": 4.352275738273696e-06, + "loss": 0.6594, + "step": 3885 + }, + { + "epoch": 1.4345311346295841, + "grad_norm": 0.8291038274765015, + "learning_rate": 4.351949051903301e-06, + "loss": 0.6641, + "step": 3886 + }, + { + "epoch": 1.4349003199606203, + "grad_norm": 0.8834021687507629, + "learning_rate": 4.351622295437218e-06, + "loss": 0.6217, + "step": 3887 + }, + { + "epoch": 1.4352695052916564, + "grad_norm": 0.8213536143302917, + "learning_rate": 4.351295468887816e-06, + "loss": 0.6604, + "step": 3888 + }, + { + "epoch": 1.4356386906226926, + "grad_norm": 1.0220136642456055, + "learning_rate": 4.350968572267465e-06, + "loss": 0.6856, + "step": 3889 + }, + { + "epoch": 1.4360078759537287, + "grad_norm": 0.8689857721328735, + "learning_rate": 4.3506416055885375e-06, + "loss": 0.6646, + "step": 3890 + }, + { + "epoch": 1.436377061284765, + "grad_norm": 0.8563553690910339, + "learning_rate": 4.3503145688634095e-06, + "loss": 0.7133, + "step": 3891 + }, + { + "epoch": 1.4367462466158012, + "grad_norm": 0.8633738160133362, + "learning_rate": 4.349987462104459e-06, + "loss": 0.652, + "step": 3892 + }, + { + "epoch": 1.4371154319468373, + "grad_norm": 0.858774721622467, + "learning_rate": 4.349660285324066e-06, + "loss": 0.6698, + "step": 3893 + }, + { + "epoch": 1.4374846172778735, + "grad_norm": 0.8820323348045349, + "learning_rate": 4.3493330385346165e-06, + "loss": 0.66, + "step": 3894 + }, + { + "epoch": 1.4378538026089096, + "grad_norm": 0.8748071789741516, + "learning_rate": 4.3490057217484935e-06, + "loss": 0.6615, + "step": 3895 + }, + { + "epoch": 1.438222987939946, + "grad_norm": 0.837755024433136, + "learning_rate": 4.348678334978089e-06, + "loss": 0.6492, + "step": 3896 + }, + { + "epoch": 1.438592173270982, + "grad_norm": 0.8703953623771667, + "learning_rate": 4.348350878235792e-06, + "loss": 0.6928, + "step": 3897 + }, + { + "epoch": 1.4389613586020182, + "grad_norm": 0.8987724184989929, + "learning_rate": 4.348023351533998e-06, + "loss": 0.6654, + "step": 3898 + }, + { + "epoch": 1.4393305439330544, + "grad_norm": 0.8669617176055908, + "learning_rate": 4.347695754885103e-06, + "loss": 0.6315, + "step": 3899 + }, + { + "epoch": 1.4396997292640905, + "grad_norm": 0.8876659274101257, + "learning_rate": 4.347368088301508e-06, + "loss": 0.6276, + "step": 3900 + }, + { + "epoch": 1.4400689145951269, + "grad_norm": 0.8626993298530579, + "learning_rate": 4.3470403517956125e-06, + "loss": 0.6655, + "step": 3901 + }, + { + "epoch": 1.440438099926163, + "grad_norm": 0.8691487908363342, + "learning_rate": 4.3467125453798235e-06, + "loss": 0.6727, + "step": 3902 + }, + { + "epoch": 1.4408072852571991, + "grad_norm": 0.8721747994422913, + "learning_rate": 4.346384669066547e-06, + "loss": 0.711, + "step": 3903 + }, + { + "epoch": 1.4411764705882353, + "grad_norm": 0.8662101626396179, + "learning_rate": 4.346056722868194e-06, + "loss": 0.6333, + "step": 3904 + }, + { + "epoch": 1.4415456559192714, + "grad_norm": 0.827303409576416, + "learning_rate": 4.345728706797175e-06, + "loss": 0.6101, + "step": 3905 + }, + { + "epoch": 1.4419148412503078, + "grad_norm": 0.8768917918205261, + "learning_rate": 4.345400620865909e-06, + "loss": 0.6611, + "step": 3906 + }, + { + "epoch": 1.442284026581344, + "grad_norm": 0.8730126619338989, + "learning_rate": 4.345072465086811e-06, + "loss": 0.653, + "step": 3907 + }, + { + "epoch": 1.44265321191238, + "grad_norm": 0.8697383999824524, + "learning_rate": 4.344744239472302e-06, + "loss": 0.7115, + "step": 3908 + }, + { + "epoch": 1.4430223972434162, + "grad_norm": 0.85337233543396, + "learning_rate": 4.344415944034807e-06, + "loss": 0.6754, + "step": 3909 + }, + { + "epoch": 1.4433915825744523, + "grad_norm": 0.881212055683136, + "learning_rate": 4.34408757878675e-06, + "loss": 0.6519, + "step": 3910 + }, + { + "epoch": 1.4437607679054887, + "grad_norm": 0.8855568170547485, + "learning_rate": 4.3437591437405604e-06, + "loss": 0.6416, + "step": 3911 + }, + { + "epoch": 1.4441299532365246, + "grad_norm": 0.8462011814117432, + "learning_rate": 4.343430638908669e-06, + "loss": 0.6584, + "step": 3912 + }, + { + "epoch": 1.444499138567561, + "grad_norm": 0.8876461386680603, + "learning_rate": 4.3431020643035095e-06, + "loss": 0.6414, + "step": 3913 + }, + { + "epoch": 1.444868323898597, + "grad_norm": 0.8745658993721008, + "learning_rate": 4.3427734199375185e-06, + "loss": 0.6272, + "step": 3914 + }, + { + "epoch": 1.4452375092296332, + "grad_norm": 0.8822771906852722, + "learning_rate": 4.342444705823136e-06, + "loss": 0.6232, + "step": 3915 + }, + { + "epoch": 1.4456066945606696, + "grad_norm": 0.8465991020202637, + "learning_rate": 4.342115921972801e-06, + "loss": 0.6727, + "step": 3916 + }, + { + "epoch": 1.4459758798917055, + "grad_norm": 0.9222819805145264, + "learning_rate": 4.341787068398962e-06, + "loss": 0.6901, + "step": 3917 + }, + { + "epoch": 1.4463450652227419, + "grad_norm": 0.8668354749679565, + "learning_rate": 4.341458145114062e-06, + "loss": 0.6413, + "step": 3918 + }, + { + "epoch": 1.446714250553778, + "grad_norm": 0.8493159413337708, + "learning_rate": 4.341129152130553e-06, + "loss": 0.6699, + "step": 3919 + }, + { + "epoch": 1.4470834358848141, + "grad_norm": 0.8714277148246765, + "learning_rate": 4.340800089460887e-06, + "loss": 0.6708, + "step": 3920 + }, + { + "epoch": 1.4474526212158503, + "grad_norm": 0.8666468262672424, + "learning_rate": 4.340470957117519e-06, + "loss": 0.6371, + "step": 3921 + }, + { + "epoch": 1.4478218065468864, + "grad_norm": 0.8656814694404602, + "learning_rate": 4.3401417551129045e-06, + "loss": 0.6512, + "step": 3922 + }, + { + "epoch": 1.4481909918779228, + "grad_norm": 0.8747579455375671, + "learning_rate": 4.339812483459506e-06, + "loss": 0.65, + "step": 3923 + }, + { + "epoch": 1.448560177208959, + "grad_norm": 0.8982210755348206, + "learning_rate": 4.339483142169787e-06, + "loss": 0.6853, + "step": 3924 + }, + { + "epoch": 1.448929362539995, + "grad_norm": 0.8802490234375, + "learning_rate": 4.3391537312562095e-06, + "loss": 0.6348, + "step": 3925 + }, + { + "epoch": 1.4492985478710312, + "grad_norm": 0.9059242606163025, + "learning_rate": 4.338824250731245e-06, + "loss": 0.658, + "step": 3926 + }, + { + "epoch": 1.4496677332020673, + "grad_norm": 0.8965008854866028, + "learning_rate": 4.338494700607363e-06, + "loss": 0.6945, + "step": 3927 + }, + { + "epoch": 1.4500369185331037, + "grad_norm": 0.8881044983863831, + "learning_rate": 4.3381650808970365e-06, + "loss": 0.6554, + "step": 3928 + }, + { + "epoch": 1.4504061038641398, + "grad_norm": 0.8862897753715515, + "learning_rate": 4.3378353916127415e-06, + "loss": 0.6657, + "step": 3929 + }, + { + "epoch": 1.450775289195176, + "grad_norm": 0.8965412378311157, + "learning_rate": 4.3375056327669575e-06, + "loss": 0.6751, + "step": 3930 + }, + { + "epoch": 1.451144474526212, + "grad_norm": 0.8650673031806946, + "learning_rate": 4.337175804372165e-06, + "loss": 0.6494, + "step": 3931 + }, + { + "epoch": 1.4515136598572482, + "grad_norm": 0.8366096019744873, + "learning_rate": 4.336845906440848e-06, + "loss": 0.6688, + "step": 3932 + }, + { + "epoch": 1.4518828451882846, + "grad_norm": 0.8645097017288208, + "learning_rate": 4.336515938985493e-06, + "loss": 0.6217, + "step": 3933 + }, + { + "epoch": 1.4522520305193207, + "grad_norm": 0.8718053698539734, + "learning_rate": 4.336185902018591e-06, + "loss": 0.6571, + "step": 3934 + }, + { + "epoch": 1.4526212158503569, + "grad_norm": 0.8648756742477417, + "learning_rate": 4.335855795552629e-06, + "loss": 0.6676, + "step": 3935 + }, + { + "epoch": 1.452990401181393, + "grad_norm": 0.865300714969635, + "learning_rate": 4.335525619600107e-06, + "loss": 0.6666, + "step": 3936 + }, + { + "epoch": 1.4533595865124291, + "grad_norm": 0.8761194348335266, + "learning_rate": 4.335195374173519e-06, + "loss": 0.6496, + "step": 3937 + }, + { + "epoch": 1.4537287718434655, + "grad_norm": 0.8784757852554321, + "learning_rate": 4.334865059285365e-06, + "loss": 0.6406, + "step": 3938 + }, + { + "epoch": 1.4540979571745016, + "grad_norm": 0.8550471067428589, + "learning_rate": 4.3345346749481475e-06, + "loss": 0.6421, + "step": 3939 + }, + { + "epoch": 1.4544671425055378, + "grad_norm": 0.8929871320724487, + "learning_rate": 4.334204221174372e-06, + "loss": 0.6829, + "step": 3940 + }, + { + "epoch": 1.454836327836574, + "grad_norm": 0.8830928206443787, + "learning_rate": 4.333873697976546e-06, + "loss": 0.6782, + "step": 3941 + }, + { + "epoch": 1.45520551316761, + "grad_norm": 0.8560413718223572, + "learning_rate": 4.333543105367179e-06, + "loss": 0.6672, + "step": 3942 + }, + { + "epoch": 1.4555746984986464, + "grad_norm": 0.889112114906311, + "learning_rate": 4.333212443358783e-06, + "loss": 0.6593, + "step": 3943 + }, + { + "epoch": 1.4559438838296825, + "grad_norm": 0.9032354950904846, + "learning_rate": 4.332881711963876e-06, + "loss": 0.6614, + "step": 3944 + }, + { + "epoch": 1.4563130691607187, + "grad_norm": 0.8813266754150391, + "learning_rate": 4.332550911194974e-06, + "loss": 0.6499, + "step": 3945 + }, + { + "epoch": 1.4566822544917548, + "grad_norm": 0.8566986322402954, + "learning_rate": 4.332220041064598e-06, + "loss": 0.6243, + "step": 3946 + }, + { + "epoch": 1.457051439822791, + "grad_norm": 0.8422845005989075, + "learning_rate": 4.331889101585272e-06, + "loss": 0.6496, + "step": 3947 + }, + { + "epoch": 1.4574206251538273, + "grad_norm": 0.8617449402809143, + "learning_rate": 4.331558092769522e-06, + "loss": 0.6354, + "step": 3948 + }, + { + "epoch": 1.4577898104848634, + "grad_norm": 0.8672434687614441, + "learning_rate": 4.331227014629877e-06, + "loss": 0.6488, + "step": 3949 + }, + { + "epoch": 1.4581589958158996, + "grad_norm": 0.8590326905250549, + "learning_rate": 4.330895867178866e-06, + "loss": 0.662, + "step": 3950 + }, + { + "epoch": 1.4585281811469357, + "grad_norm": 0.8981523513793945, + "learning_rate": 4.330564650429025e-06, + "loss": 0.6557, + "step": 3951 + }, + { + "epoch": 1.4588973664779719, + "grad_norm": 0.8811792731285095, + "learning_rate": 4.3302333643928886e-06, + "loss": 0.7339, + "step": 3952 + }, + { + "epoch": 1.4592665518090082, + "grad_norm": 0.9235739707946777, + "learning_rate": 4.329902009082998e-06, + "loss": 0.6454, + "step": 3953 + }, + { + "epoch": 1.4596357371400444, + "grad_norm": 0.8625849485397339, + "learning_rate": 4.329570584511894e-06, + "loss": 0.6187, + "step": 3954 + }, + { + "epoch": 1.4600049224710805, + "grad_norm": 0.8345111012458801, + "learning_rate": 4.329239090692121e-06, + "loss": 0.656, + "step": 3955 + }, + { + "epoch": 1.4603741078021166, + "grad_norm": 0.8663410544395447, + "learning_rate": 4.328907527636224e-06, + "loss": 0.6446, + "step": 3956 + }, + { + "epoch": 1.4607432931331528, + "grad_norm": 0.8366450071334839, + "learning_rate": 4.328575895356757e-06, + "loss": 0.6464, + "step": 3957 + }, + { + "epoch": 1.4611124784641891, + "grad_norm": 0.8813480138778687, + "learning_rate": 4.328244193866267e-06, + "loss": 0.6224, + "step": 3958 + }, + { + "epoch": 1.4614816637952253, + "grad_norm": 0.8699328899383545, + "learning_rate": 4.327912423177313e-06, + "loss": 0.6818, + "step": 3959 + }, + { + "epoch": 1.4618508491262614, + "grad_norm": 0.8940434455871582, + "learning_rate": 4.3275805833024495e-06, + "loss": 0.6572, + "step": 3960 + }, + { + "epoch": 1.4622200344572975, + "grad_norm": 0.8549720048904419, + "learning_rate": 4.327248674254239e-06, + "loss": 0.6382, + "step": 3961 + }, + { + "epoch": 1.4625892197883337, + "grad_norm": 0.877691924571991, + "learning_rate": 4.326916696045242e-06, + "loss": 0.6777, + "step": 3962 + }, + { + "epoch": 1.46295840511937, + "grad_norm": 0.9076605439186096, + "learning_rate": 4.326584648688025e-06, + "loss": 0.6711, + "step": 3963 + }, + { + "epoch": 1.4633275904504062, + "grad_norm": 0.8827358484268188, + "learning_rate": 4.3262525321951544e-06, + "loss": 0.647, + "step": 3964 + }, + { + "epoch": 1.4636967757814423, + "grad_norm": 0.8720611929893494, + "learning_rate": 4.325920346579203e-06, + "loss": 0.6176, + "step": 3965 + }, + { + "epoch": 1.4640659611124784, + "grad_norm": 0.8781262040138245, + "learning_rate": 4.325588091852743e-06, + "loss": 0.6623, + "step": 3966 + }, + { + "epoch": 1.4644351464435146, + "grad_norm": 0.8381863236427307, + "learning_rate": 4.325255768028349e-06, + "loss": 0.692, + "step": 3967 + }, + { + "epoch": 1.464804331774551, + "grad_norm": 0.8596665263175964, + "learning_rate": 4.3249233751186e-06, + "loss": 0.6625, + "step": 3968 + }, + { + "epoch": 1.465173517105587, + "grad_norm": 0.8398455381393433, + "learning_rate": 4.324590913136079e-06, + "loss": 0.6488, + "step": 3969 + }, + { + "epoch": 1.4655427024366232, + "grad_norm": 0.8821480870246887, + "learning_rate": 4.324258382093366e-06, + "loss": 0.6582, + "step": 3970 + }, + { + "epoch": 1.4659118877676593, + "grad_norm": 0.8625421524047852, + "learning_rate": 4.323925782003049e-06, + "loss": 0.6982, + "step": 3971 + }, + { + "epoch": 1.4662810730986955, + "grad_norm": 0.859168529510498, + "learning_rate": 4.3235931128777176e-06, + "loss": 0.6275, + "step": 3972 + }, + { + "epoch": 1.4666502584297318, + "grad_norm": 0.8612698316574097, + "learning_rate": 4.323260374729962e-06, + "loss": 0.6391, + "step": 3973 + }, + { + "epoch": 1.467019443760768, + "grad_norm": 0.856327474117279, + "learning_rate": 4.322927567572377e-06, + "loss": 0.6818, + "step": 3974 + }, + { + "epoch": 1.4673886290918041, + "grad_norm": 0.8736692667007446, + "learning_rate": 4.322594691417559e-06, + "loss": 0.6263, + "step": 3975 + }, + { + "epoch": 1.4677578144228403, + "grad_norm": 0.84726482629776, + "learning_rate": 4.322261746278107e-06, + "loss": 0.6302, + "step": 3976 + }, + { + "epoch": 1.4681269997538764, + "grad_norm": 0.8586804270744324, + "learning_rate": 4.321928732166623e-06, + "loss": 0.6947, + "step": 3977 + }, + { + "epoch": 1.4684961850849128, + "grad_norm": 0.8999873399734497, + "learning_rate": 4.321595649095711e-06, + "loss": 0.6515, + "step": 3978 + }, + { + "epoch": 1.4688653704159487, + "grad_norm": 0.8899109363555908, + "learning_rate": 4.321262497077979e-06, + "loss": 0.6783, + "step": 3979 + }, + { + "epoch": 1.469234555746985, + "grad_norm": 0.8734593987464905, + "learning_rate": 4.3209292761260354e-06, + "loss": 0.6239, + "step": 3980 + }, + { + "epoch": 1.4696037410780212, + "grad_norm": 0.8852055668830872, + "learning_rate": 4.320595986252495e-06, + "loss": 0.67, + "step": 3981 + }, + { + "epoch": 1.4699729264090573, + "grad_norm": 0.8622931241989136, + "learning_rate": 4.32026262746997e-06, + "loss": 0.6296, + "step": 3982 + }, + { + "epoch": 1.4703421117400937, + "grad_norm": 0.8493199348449707, + "learning_rate": 4.31992919979108e-06, + "loss": 0.6679, + "step": 3983 + }, + { + "epoch": 1.4707112970711296, + "grad_norm": 0.8741939663887024, + "learning_rate": 4.319595703228443e-06, + "loss": 0.6555, + "step": 3984 + }, + { + "epoch": 1.471080482402166, + "grad_norm": 0.8824664354324341, + "learning_rate": 4.319262137794683e-06, + "loss": 0.6722, + "step": 3985 + }, + { + "epoch": 1.471449667733202, + "grad_norm": 0.8817678689956665, + "learning_rate": 4.318928503502426e-06, + "loss": 0.6593, + "step": 3986 + }, + { + "epoch": 1.4718188530642382, + "grad_norm": 0.8646631836891174, + "learning_rate": 4.318594800364299e-06, + "loss": 0.6407, + "step": 3987 + }, + { + "epoch": 1.4721880383952743, + "grad_norm": 0.8642030358314514, + "learning_rate": 4.318261028392933e-06, + "loss": 0.671, + "step": 3988 + }, + { + "epoch": 1.4725572237263105, + "grad_norm": 0.8524730801582336, + "learning_rate": 4.317927187600961e-06, + "loss": 0.627, + "step": 3989 + }, + { + "epoch": 1.4729264090573468, + "grad_norm": 0.8443734645843506, + "learning_rate": 4.317593278001018e-06, + "loss": 0.66, + "step": 3990 + }, + { + "epoch": 1.473295594388383, + "grad_norm": 0.8545972108840942, + "learning_rate": 4.317259299605743e-06, + "loss": 0.6634, + "step": 3991 + }, + { + "epoch": 1.4736647797194191, + "grad_norm": 0.8763890862464905, + "learning_rate": 4.316925252427777e-06, + "loss": 0.6695, + "step": 3992 + }, + { + "epoch": 1.4740339650504553, + "grad_norm": 0.8736660480499268, + "learning_rate": 4.316591136479764e-06, + "loss": 0.675, + "step": 3993 + }, + { + "epoch": 1.4744031503814914, + "grad_norm": 0.8856341242790222, + "learning_rate": 4.316256951774351e-06, + "loss": 0.6436, + "step": 3994 + }, + { + "epoch": 1.4747723357125277, + "grad_norm": 0.8500943183898926, + "learning_rate": 4.315922698324184e-06, + "loss": 0.6482, + "step": 3995 + }, + { + "epoch": 1.4751415210435639, + "grad_norm": 0.8709170818328857, + "learning_rate": 4.315588376141916e-06, + "loss": 0.6291, + "step": 3996 + }, + { + "epoch": 1.4755107063746, + "grad_norm": 0.8660770058631897, + "learning_rate": 4.315253985240201e-06, + "loss": 0.6808, + "step": 3997 + }, + { + "epoch": 1.4758798917056362, + "grad_norm": 0.8607604503631592, + "learning_rate": 4.314919525631696e-06, + "loss": 0.6356, + "step": 3998 + }, + { + "epoch": 1.4762490770366723, + "grad_norm": 0.8453649878501892, + "learning_rate": 4.31458499732906e-06, + "loss": 0.6396, + "step": 3999 + }, + { + "epoch": 1.4766182623677087, + "grad_norm": 0.8565241098403931, + "learning_rate": 4.314250400344954e-06, + "loss": 0.6512, + "step": 4000 + }, + { + "epoch": 1.4769874476987448, + "grad_norm": 0.8427796959877014, + "learning_rate": 4.313915734692044e-06, + "loss": 0.6857, + "step": 4001 + }, + { + "epoch": 1.477356633029781, + "grad_norm": 0.9255577921867371, + "learning_rate": 4.313581000382995e-06, + "loss": 0.6428, + "step": 4002 + }, + { + "epoch": 1.477725818360817, + "grad_norm": 0.8371117115020752, + "learning_rate": 4.313246197430478e-06, + "loss": 0.6304, + "step": 4003 + }, + { + "epoch": 1.4780950036918532, + "grad_norm": 0.8278629779815674, + "learning_rate": 4.3129113258471635e-06, + "loss": 0.6248, + "step": 4004 + }, + { + "epoch": 1.4784641890228896, + "grad_norm": 0.8898910880088806, + "learning_rate": 4.312576385645728e-06, + "loss": 0.7104, + "step": 4005 + }, + { + "epoch": 1.4788333743539257, + "grad_norm": 0.8577858805656433, + "learning_rate": 4.312241376838849e-06, + "loss": 0.5926, + "step": 4006 + }, + { + "epoch": 1.4792025596849618, + "grad_norm": 0.8747662901878357, + "learning_rate": 4.3119062994392055e-06, + "loss": 0.6244, + "step": 4007 + }, + { + "epoch": 1.479571745015998, + "grad_norm": 0.8819793462753296, + "learning_rate": 4.31157115345948e-06, + "loss": 0.6511, + "step": 4008 + }, + { + "epoch": 1.4799409303470341, + "grad_norm": 0.8706238865852356, + "learning_rate": 4.311235938912359e-06, + "loss": 0.6683, + "step": 4009 + }, + { + "epoch": 1.4803101156780705, + "grad_norm": 0.8562942743301392, + "learning_rate": 4.310900655810528e-06, + "loss": 0.6737, + "step": 4010 + }, + { + "epoch": 1.4806793010091066, + "grad_norm": 0.9153907299041748, + "learning_rate": 4.31056530416668e-06, + "loss": 0.6684, + "step": 4011 + }, + { + "epoch": 1.4810484863401427, + "grad_norm": 0.8702272176742554, + "learning_rate": 4.310229883993506e-06, + "loss": 0.6295, + "step": 4012 + }, + { + "epoch": 1.4814176716711789, + "grad_norm": 0.8874361515045166, + "learning_rate": 4.309894395303703e-06, + "loss": 0.6393, + "step": 4013 + }, + { + "epoch": 1.481786857002215, + "grad_norm": 0.8515747785568237, + "learning_rate": 4.309558838109967e-06, + "loss": 0.6857, + "step": 4014 + }, + { + "epoch": 1.4821560423332514, + "grad_norm": 0.8663908243179321, + "learning_rate": 4.309223212425001e-06, + "loss": 0.6702, + "step": 4015 + }, + { + "epoch": 1.4825252276642875, + "grad_norm": 0.8743873238563538, + "learning_rate": 4.308887518261507e-06, + "loss": 0.7008, + "step": 4016 + }, + { + "epoch": 1.4828944129953237, + "grad_norm": 0.8690647482872009, + "learning_rate": 4.3085517556321925e-06, + "loss": 0.6619, + "step": 4017 + }, + { + "epoch": 1.4832635983263598, + "grad_norm": 0.8630088567733765, + "learning_rate": 4.308215924549764e-06, + "loss": 0.5832, + "step": 4018 + }, + { + "epoch": 1.483632783657396, + "grad_norm": 0.8403893113136292, + "learning_rate": 4.307880025026933e-06, + "loss": 0.6756, + "step": 4019 + }, + { + "epoch": 1.4840019689884323, + "grad_norm": 0.8611646890640259, + "learning_rate": 4.307544057076414e-06, + "loss": 0.6558, + "step": 4020 + }, + { + "epoch": 1.4843711543194684, + "grad_norm": 0.9606013894081116, + "learning_rate": 4.307208020710922e-06, + "loss": 0.6745, + "step": 4021 + }, + { + "epoch": 1.4847403396505046, + "grad_norm": 0.8890616297721863, + "learning_rate": 4.306871915943178e-06, + "loss": 0.6578, + "step": 4022 + }, + { + "epoch": 1.4851095249815407, + "grad_norm": 0.9102274775505066, + "learning_rate": 4.306535742785902e-06, + "loss": 0.6791, + "step": 4023 + }, + { + "epoch": 1.4854787103125768, + "grad_norm": 0.8693407773971558, + "learning_rate": 4.306199501251818e-06, + "loss": 0.6696, + "step": 4024 + }, + { + "epoch": 1.4858478956436132, + "grad_norm": 0.8936747908592224, + "learning_rate": 4.305863191353653e-06, + "loss": 0.6725, + "step": 4025 + }, + { + "epoch": 1.4862170809746493, + "grad_norm": 0.8767732381820679, + "learning_rate": 4.305526813104137e-06, + "loss": 0.6745, + "step": 4026 + }, + { + "epoch": 1.4865862663056855, + "grad_norm": 0.8594468235969543, + "learning_rate": 4.305190366516e-06, + "loss": 0.6496, + "step": 4027 + }, + { + "epoch": 1.4869554516367216, + "grad_norm": 0.8595309853553772, + "learning_rate": 4.3048538516019775e-06, + "loss": 0.6506, + "step": 4028 + }, + { + "epoch": 1.4873246369677577, + "grad_norm": 0.862367570400238, + "learning_rate": 4.304517268374806e-06, + "loss": 0.6591, + "step": 4029 + }, + { + "epoch": 1.487693822298794, + "grad_norm": 0.8560047745704651, + "learning_rate": 4.304180616847226e-06, + "loss": 0.6602, + "step": 4030 + }, + { + "epoch": 1.4880630076298302, + "grad_norm": 0.8659210801124573, + "learning_rate": 4.303843897031979e-06, + "loss": 0.6092, + "step": 4031 + }, + { + "epoch": 1.4884321929608664, + "grad_norm": 0.8587557673454285, + "learning_rate": 4.3035071089418095e-06, + "loss": 0.6502, + "step": 4032 + }, + { + "epoch": 1.4888013782919025, + "grad_norm": 0.8538966178894043, + "learning_rate": 4.303170252589465e-06, + "loss": 0.6469, + "step": 4033 + }, + { + "epoch": 1.4891705636229386, + "grad_norm": 0.8749778270721436, + "learning_rate": 4.302833327987696e-06, + "loss": 0.6749, + "step": 4034 + }, + { + "epoch": 1.489539748953975, + "grad_norm": 0.8690160512924194, + "learning_rate": 4.302496335149255e-06, + "loss": 0.6427, + "step": 4035 + }, + { + "epoch": 1.4899089342850111, + "grad_norm": 0.8996536135673523, + "learning_rate": 4.302159274086896e-06, + "loss": 0.6701, + "step": 4036 + }, + { + "epoch": 1.4902781196160473, + "grad_norm": 0.8690139055252075, + "learning_rate": 4.301822144813377e-06, + "loss": 0.6483, + "step": 4037 + }, + { + "epoch": 1.4906473049470834, + "grad_norm": 0.8757538795471191, + "learning_rate": 4.301484947341458e-06, + "loss": 0.6417, + "step": 4038 + }, + { + "epoch": 1.4910164902781196, + "grad_norm": 0.8546977639198303, + "learning_rate": 4.3011476816839036e-06, + "loss": 0.6261, + "step": 4039 + }, + { + "epoch": 1.491385675609156, + "grad_norm": 0.8905868530273438, + "learning_rate": 4.300810347853477e-06, + "loss": 0.6524, + "step": 4040 + }, + { + "epoch": 1.491754860940192, + "grad_norm": 0.8751607537269592, + "learning_rate": 4.300472945862947e-06, + "loss": 0.6448, + "step": 4041 + }, + { + "epoch": 1.4921240462712282, + "grad_norm": 0.8500416874885559, + "learning_rate": 4.3001354757250845e-06, + "loss": 0.6487, + "step": 4042 + }, + { + "epoch": 1.4924932316022643, + "grad_norm": 0.8601714968681335, + "learning_rate": 4.299797937452662e-06, + "loss": 0.6424, + "step": 4043 + }, + { + "epoch": 1.4928624169333005, + "grad_norm": 0.8698818683624268, + "learning_rate": 4.2994603310584566e-06, + "loss": 0.6622, + "step": 4044 + }, + { + "epoch": 1.4932316022643368, + "grad_norm": 0.840782642364502, + "learning_rate": 4.299122656555245e-06, + "loss": 0.6635, + "step": 4045 + }, + { + "epoch": 1.493600787595373, + "grad_norm": 0.8770154714584351, + "learning_rate": 4.298784913955809e-06, + "loss": 0.6947, + "step": 4046 + }, + { + "epoch": 1.493969972926409, + "grad_norm": 0.8693561553955078, + "learning_rate": 4.2984471032729315e-06, + "loss": 0.636, + "step": 4047 + }, + { + "epoch": 1.4943391582574452, + "grad_norm": 0.8588740825653076, + "learning_rate": 4.298109224519398e-06, + "loss": 0.6629, + "step": 4048 + }, + { + "epoch": 1.4947083435884814, + "grad_norm": 0.8925417065620422, + "learning_rate": 4.297771277707998e-06, + "loss": 0.6442, + "step": 4049 + }, + { + "epoch": 1.4950775289195177, + "grad_norm": 0.8721441030502319, + "learning_rate": 4.297433262851523e-06, + "loss": 0.6636, + "step": 4050 + }, + { + "epoch": 1.4954467142505536, + "grad_norm": 0.8867325782775879, + "learning_rate": 4.297095179962767e-06, + "loss": 0.6987, + "step": 4051 + }, + { + "epoch": 1.49581589958159, + "grad_norm": 0.8766324520111084, + "learning_rate": 4.296757029054524e-06, + "loss": 0.6564, + "step": 4052 + }, + { + "epoch": 1.4961850849126261, + "grad_norm": 0.8506335616111755, + "learning_rate": 4.2964188101395964e-06, + "loss": 0.685, + "step": 4053 + }, + { + "epoch": 1.4965542702436623, + "grad_norm": 0.8920146226882935, + "learning_rate": 4.296080523230783e-06, + "loss": 0.6619, + "step": 4054 + }, + { + "epoch": 1.4969234555746986, + "grad_norm": 0.863493800163269, + "learning_rate": 4.295742168340888e-06, + "loss": 0.6595, + "step": 4055 + }, + { + "epoch": 1.4972926409057346, + "grad_norm": 0.8447588682174683, + "learning_rate": 4.295403745482719e-06, + "loss": 0.6422, + "step": 4056 + }, + { + "epoch": 1.497661826236771, + "grad_norm": 0.8743113875389099, + "learning_rate": 4.295065254669085e-06, + "loss": 0.646, + "step": 4057 + }, + { + "epoch": 1.498031011567807, + "grad_norm": 0.8360143899917603, + "learning_rate": 4.294726695912798e-06, + "loss": 0.6712, + "step": 4058 + }, + { + "epoch": 1.4984001968988432, + "grad_norm": 0.9043782353401184, + "learning_rate": 4.294388069226672e-06, + "loss": 0.6534, + "step": 4059 + }, + { + "epoch": 1.4987693822298793, + "grad_norm": 0.8805797696113586, + "learning_rate": 4.294049374623523e-06, + "loss": 0.6487, + "step": 4060 + }, + { + "epoch": 1.4991385675609155, + "grad_norm": 0.8812898993492126, + "learning_rate": 4.293710612116172e-06, + "loss": 0.7126, + "step": 4061 + }, + { + "epoch": 1.4995077528919518, + "grad_norm": 0.8640240430831909, + "learning_rate": 4.293371781717441e-06, + "loss": 0.6717, + "step": 4062 + }, + { + "epoch": 1.499876938222988, + "grad_norm": 0.8877006769180298, + "learning_rate": 4.293032883440152e-06, + "loss": 0.7001, + "step": 4063 + }, + { + "epoch": 1.500246123554024, + "grad_norm": 0.8582589626312256, + "learning_rate": 4.292693917297136e-06, + "loss": 0.5952, + "step": 4064 + }, + { + "epoch": 1.5006153088850605, + "grad_norm": 0.8693685531616211, + "learning_rate": 4.292354883301221e-06, + "loss": 0.6581, + "step": 4065 + }, + { + "epoch": 1.5009844942160964, + "grad_norm": 0.8873693346977234, + "learning_rate": 4.292015781465239e-06, + "loss": 0.6749, + "step": 4066 + }, + { + "epoch": 1.5013536795471327, + "grad_norm": 0.8655200600624084, + "learning_rate": 4.291676611802026e-06, + "loss": 0.6304, + "step": 4067 + }, + { + "epoch": 1.5017228648781689, + "grad_norm": 0.8552690148353577, + "learning_rate": 4.291337374324417e-06, + "loss": 0.6726, + "step": 4068 + }, + { + "epoch": 1.502092050209205, + "grad_norm": 0.8689244985580444, + "learning_rate": 4.290998069045255e-06, + "loss": 0.6281, + "step": 4069 + }, + { + "epoch": 1.5024612355402414, + "grad_norm": 0.8591857552528381, + "learning_rate": 4.290658695977381e-06, + "loss": 0.6353, + "step": 4070 + }, + { + "epoch": 1.5028304208712773, + "grad_norm": 0.870262861251831, + "learning_rate": 4.29031925513364e-06, + "loss": 0.6965, + "step": 4071 + }, + { + "epoch": 1.5031996062023136, + "grad_norm": 0.8554145097732544, + "learning_rate": 4.289979746526881e-06, + "loss": 0.6621, + "step": 4072 + }, + { + "epoch": 1.5035687915333498, + "grad_norm": 0.8629531860351562, + "learning_rate": 4.289640170169953e-06, + "loss": 0.6621, + "step": 4073 + }, + { + "epoch": 1.503937976864386, + "grad_norm": 0.8806639909744263, + "learning_rate": 4.2893005260757105e-06, + "loss": 0.6896, + "step": 4074 + }, + { + "epoch": 1.5043071621954223, + "grad_norm": 0.873180627822876, + "learning_rate": 4.288960814257006e-06, + "loss": 0.6342, + "step": 4075 + }, + { + "epoch": 1.5046763475264582, + "grad_norm": 0.8682287931442261, + "learning_rate": 4.288621034726701e-06, + "loss": 0.6242, + "step": 4076 + }, + { + "epoch": 1.5050455328574945, + "grad_norm": 0.8361064791679382, + "learning_rate": 4.288281187497653e-06, + "loss": 0.6628, + "step": 4077 + }, + { + "epoch": 1.5054147181885307, + "grad_norm": 0.8703436255455017, + "learning_rate": 4.287941272582727e-06, + "loss": 0.69, + "step": 4078 + }, + { + "epoch": 1.5057839035195668, + "grad_norm": 0.8886205554008484, + "learning_rate": 4.287601289994789e-06, + "loss": 0.6552, + "step": 4079 + }, + { + "epoch": 1.506153088850603, + "grad_norm": 0.8863624334335327, + "learning_rate": 4.2872612397467046e-06, + "loss": 0.681, + "step": 4080 + }, + { + "epoch": 1.506522274181639, + "grad_norm": 0.8850600719451904, + "learning_rate": 4.286921121851347e-06, + "loss": 0.6583, + "step": 4081 + }, + { + "epoch": 1.5068914595126754, + "grad_norm": 0.8744266629219055, + "learning_rate": 4.28658093632159e-06, + "loss": 0.6608, + "step": 4082 + }, + { + "epoch": 1.5072606448437116, + "grad_norm": 0.8750613331794739, + "learning_rate": 4.286240683170307e-06, + "loss": 0.6774, + "step": 4083 + }, + { + "epoch": 1.5076298301747477, + "grad_norm": 0.8728808760643005, + "learning_rate": 4.2859003624103775e-06, + "loss": 0.6952, + "step": 4084 + }, + { + "epoch": 1.5079990155057839, + "grad_norm": 0.9113942384719849, + "learning_rate": 4.285559974054685e-06, + "loss": 0.6863, + "step": 4085 + }, + { + "epoch": 1.50836820083682, + "grad_norm": 0.9054762721061707, + "learning_rate": 4.2852195181161095e-06, + "loss": 0.663, + "step": 4086 + }, + { + "epoch": 1.5087373861678564, + "grad_norm": 0.863274335861206, + "learning_rate": 4.284878994607539e-06, + "loss": 0.6875, + "step": 4087 + }, + { + "epoch": 1.5091065714988925, + "grad_norm": 0.9011262059211731, + "learning_rate": 4.284538403541862e-06, + "loss": 0.7108, + "step": 4088 + }, + { + "epoch": 1.5094757568299286, + "grad_norm": 0.8890613913536072, + "learning_rate": 4.284197744931969e-06, + "loss": 0.6324, + "step": 4089 + }, + { + "epoch": 1.5098449421609648, + "grad_norm": 0.8556955456733704, + "learning_rate": 4.2838570187907556e-06, + "loss": 0.6489, + "step": 4090 + }, + { + "epoch": 1.510214127492001, + "grad_norm": 0.9095219969749451, + "learning_rate": 4.283516225131116e-06, + "loss": 0.6437, + "step": 4091 + }, + { + "epoch": 1.5105833128230373, + "grad_norm": 0.8610695600509644, + "learning_rate": 4.283175363965951e-06, + "loss": 0.6386, + "step": 4092 + }, + { + "epoch": 1.5109524981540734, + "grad_norm": 0.88383549451828, + "learning_rate": 4.282834435308161e-06, + "loss": 0.6789, + "step": 4093 + }, + { + "epoch": 1.5113216834851095, + "grad_norm": 0.8785969614982605, + "learning_rate": 4.28249343917065e-06, + "loss": 0.6695, + "step": 4094 + }, + { + "epoch": 1.5116908688161457, + "grad_norm": 0.9116064310073853, + "learning_rate": 4.2821523755663245e-06, + "loss": 0.6848, + "step": 4095 + }, + { + "epoch": 1.5120600541471818, + "grad_norm": 0.8734719157218933, + "learning_rate": 4.281811244508094e-06, + "loss": 0.6425, + "step": 4096 + }, + { + "epoch": 1.5124292394782182, + "grad_norm": 0.8691825270652771, + "learning_rate": 4.2814700460088705e-06, + "loss": 0.6504, + "step": 4097 + }, + { + "epoch": 1.512798424809254, + "grad_norm": 0.8592444062232971, + "learning_rate": 4.2811287800815695e-06, + "loss": 0.6459, + "step": 4098 + }, + { + "epoch": 1.5131676101402904, + "grad_norm": 0.872750461101532, + "learning_rate": 4.280787446739105e-06, + "loss": 0.6559, + "step": 4099 + }, + { + "epoch": 1.5135367954713266, + "grad_norm": 0.8969438672065735, + "learning_rate": 4.280446045994398e-06, + "loss": 0.6134, + "step": 4100 + }, + { + "epoch": 1.5139059808023627, + "grad_norm": 0.8667454123497009, + "learning_rate": 4.2801045778603685e-06, + "loss": 0.6463, + "step": 4101 + }, + { + "epoch": 1.514275166133399, + "grad_norm": 0.8824355006217957, + "learning_rate": 4.2797630423499445e-06, + "loss": 0.6768, + "step": 4102 + }, + { + "epoch": 1.514644351464435, + "grad_norm": 0.8691312074661255, + "learning_rate": 4.27942143947605e-06, + "loss": 0.652, + "step": 4103 + }, + { + "epoch": 1.5150135367954713, + "grad_norm": 0.8782834410667419, + "learning_rate": 4.279079769251617e-06, + "loss": 0.6687, + "step": 4104 + }, + { + "epoch": 1.5153827221265075, + "grad_norm": 0.859470784664154, + "learning_rate": 4.278738031689574e-06, + "loss": 0.6207, + "step": 4105 + }, + { + "epoch": 1.5157519074575436, + "grad_norm": 0.8601518869400024, + "learning_rate": 4.27839622680286e-06, + "loss": 0.6321, + "step": 4106 + }, + { + "epoch": 1.51612109278858, + "grad_norm": 0.8625545501708984, + "learning_rate": 4.27805435460441e-06, + "loss": 0.6222, + "step": 4107 + }, + { + "epoch": 1.516490278119616, + "grad_norm": 0.853982150554657, + "learning_rate": 4.277712415107163e-06, + "loss": 0.6746, + "step": 4108 + }, + { + "epoch": 1.5168594634506523, + "grad_norm": 0.8507782816886902, + "learning_rate": 4.277370408324061e-06, + "loss": 0.6427, + "step": 4109 + }, + { + "epoch": 1.5172286487816884, + "grad_norm": 0.8598101139068604, + "learning_rate": 4.277028334268052e-06, + "loss": 0.6474, + "step": 4110 + }, + { + "epoch": 1.5175978341127245, + "grad_norm": 0.8928685188293457, + "learning_rate": 4.276686192952079e-06, + "loss": 0.6673, + "step": 4111 + }, + { + "epoch": 1.517967019443761, + "grad_norm": 0.8757143020629883, + "learning_rate": 4.276343984389098e-06, + "loss": 0.6556, + "step": 4112 + }, + { + "epoch": 1.5183362047747968, + "grad_norm": 0.8892792463302612, + "learning_rate": 4.276001708592055e-06, + "loss": 0.6595, + "step": 4113 + }, + { + "epoch": 1.5187053901058332, + "grad_norm": 0.8966202735900879, + "learning_rate": 4.275659365573908e-06, + "loss": 0.6533, + "step": 4114 + }, + { + "epoch": 1.5190745754368693, + "grad_norm": 0.9023223519325256, + "learning_rate": 4.275316955347616e-06, + "loss": 0.6536, + "step": 4115 + }, + { + "epoch": 1.5194437607679054, + "grad_norm": 0.8740276098251343, + "learning_rate": 4.274974477926136e-06, + "loss": 0.6467, + "step": 4116 + }, + { + "epoch": 1.5198129460989418, + "grad_norm": 0.8580527305603027, + "learning_rate": 4.274631933322433e-06, + "loss": 0.6339, + "step": 4117 + }, + { + "epoch": 1.5201821314299777, + "grad_norm": 0.8442912101745605, + "learning_rate": 4.274289321549471e-06, + "loss": 0.6491, + "step": 4118 + }, + { + "epoch": 1.520551316761014, + "grad_norm": 0.8857823610305786, + "learning_rate": 4.273946642620218e-06, + "loss": 0.6835, + "step": 4119 + }, + { + "epoch": 1.5209205020920502, + "grad_norm": 0.8766312003135681, + "learning_rate": 4.273603896547645e-06, + "loss": 0.6239, + "step": 4120 + }, + { + "epoch": 1.5212896874230863, + "grad_norm": 0.867927074432373, + "learning_rate": 4.2732610833447236e-06, + "loss": 0.661, + "step": 4121 + }, + { + "epoch": 1.5216588727541227, + "grad_norm": 0.853773295879364, + "learning_rate": 4.2729182030244296e-06, + "loss": 0.6219, + "step": 4122 + }, + { + "epoch": 1.5220280580851586, + "grad_norm": 0.8582534790039062, + "learning_rate": 4.272575255599743e-06, + "loss": 0.6741, + "step": 4123 + }, + { + "epoch": 1.522397243416195, + "grad_norm": 0.8949733376502991, + "learning_rate": 4.27223224108364e-06, + "loss": 0.6326, + "step": 4124 + }, + { + "epoch": 1.5227664287472311, + "grad_norm": 0.8849816918373108, + "learning_rate": 4.271889159489108e-06, + "loss": 0.6662, + "step": 4125 + }, + { + "epoch": 1.5231356140782673, + "grad_norm": 0.899662435054779, + "learning_rate": 4.2715460108291296e-06, + "loss": 0.6838, + "step": 4126 + }, + { + "epoch": 1.5235047994093036, + "grad_norm": 0.917032778263092, + "learning_rate": 4.271202795116694e-06, + "loss": 0.6807, + "step": 4127 + }, + { + "epoch": 1.5238739847403395, + "grad_norm": 0.8956341743469238, + "learning_rate": 4.270859512364791e-06, + "loss": 0.6565, + "step": 4128 + }, + { + "epoch": 1.5242431700713759, + "grad_norm": 0.875110924243927, + "learning_rate": 4.270516162586415e-06, + "loss": 0.6784, + "step": 4129 + }, + { + "epoch": 1.524612355402412, + "grad_norm": 0.8935921788215637, + "learning_rate": 4.270172745794562e-06, + "loss": 0.6455, + "step": 4130 + }, + { + "epoch": 1.5249815407334482, + "grad_norm": 0.8863093256950378, + "learning_rate": 4.2698292620022275e-06, + "loss": 0.663, + "step": 4131 + }, + { + "epoch": 1.5253507260644845, + "grad_norm": 0.903451144695282, + "learning_rate": 4.269485711222414e-06, + "loss": 0.7022, + "step": 4132 + }, + { + "epoch": 1.5257199113955204, + "grad_norm": 0.8865953683853149, + "learning_rate": 4.269142093468126e-06, + "loss": 0.6422, + "step": 4133 + }, + { + "epoch": 1.5260890967265568, + "grad_norm": 0.8502559065818787, + "learning_rate": 4.2687984087523675e-06, + "loss": 0.6385, + "step": 4134 + }, + { + "epoch": 1.526458282057593, + "grad_norm": 0.8693287968635559, + "learning_rate": 4.268454657088148e-06, + "loss": 0.6515, + "step": 4135 + }, + { + "epoch": 1.526827467388629, + "grad_norm": 0.8611676096916199, + "learning_rate": 4.2681108384884775e-06, + "loss": 0.6649, + "step": 4136 + }, + { + "epoch": 1.5271966527196654, + "grad_norm": 0.8676669597625732, + "learning_rate": 4.267766952966369e-06, + "loss": 0.646, + "step": 4137 + }, + { + "epoch": 1.5275658380507013, + "grad_norm": 0.8639422655105591, + "learning_rate": 4.267423000534841e-06, + "loss": 0.6601, + "step": 4138 + }, + { + "epoch": 1.5279350233817377, + "grad_norm": 0.8903855681419373, + "learning_rate": 4.267078981206908e-06, + "loss": 0.6673, + "step": 4139 + }, + { + "epoch": 1.5283042087127738, + "grad_norm": 0.8872783780097961, + "learning_rate": 4.266734894995595e-06, + "loss": 0.6374, + "step": 4140 + }, + { + "epoch": 1.52867339404381, + "grad_norm": 0.8437135219573975, + "learning_rate": 4.266390741913923e-06, + "loss": 0.673, + "step": 4141 + }, + { + "epoch": 1.5290425793748463, + "grad_norm": 0.8592482805252075, + "learning_rate": 4.266046521974919e-06, + "loss": 0.6377, + "step": 4142 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.9012287259101868, + "learning_rate": 4.265702235191612e-06, + "loss": 0.6759, + "step": 4143 + }, + { + "epoch": 1.5297809500369186, + "grad_norm": 0.8785265684127808, + "learning_rate": 4.2653578815770326e-06, + "loss": 0.6699, + "step": 4144 + }, + { + "epoch": 1.5301501353679547, + "grad_norm": 0.8822756409645081, + "learning_rate": 4.265013461144214e-06, + "loss": 0.6708, + "step": 4145 + }, + { + "epoch": 1.5305193206989909, + "grad_norm": 0.8577251434326172, + "learning_rate": 4.264668973906193e-06, + "loss": 0.6445, + "step": 4146 + }, + { + "epoch": 1.530888506030027, + "grad_norm": 0.8777412176132202, + "learning_rate": 4.264324419876008e-06, + "loss": 0.6609, + "step": 4147 + }, + { + "epoch": 1.5312576913610632, + "grad_norm": 0.8454368114471436, + "learning_rate": 4.2639797990667016e-06, + "loss": 0.6269, + "step": 4148 + }, + { + "epoch": 1.5316268766920995, + "grad_norm": 0.8819066286087036, + "learning_rate": 4.263635111491316e-06, + "loss": 0.663, + "step": 4149 + }, + { + "epoch": 1.5319960620231357, + "grad_norm": 0.8797821998596191, + "learning_rate": 4.263290357162897e-06, + "loss": 0.6742, + "step": 4150 + }, + { + "epoch": 1.5323652473541718, + "grad_norm": 0.8543263077735901, + "learning_rate": 4.262945536094496e-06, + "loss": 0.6324, + "step": 4151 + }, + { + "epoch": 1.532734432685208, + "grad_norm": 0.863166093826294, + "learning_rate": 4.262600648299162e-06, + "loss": 0.644, + "step": 4152 + }, + { + "epoch": 1.533103618016244, + "grad_norm": 0.8374249339103699, + "learning_rate": 4.2622556937899505e-06, + "loss": 0.617, + "step": 4153 + }, + { + "epoch": 1.5334728033472804, + "grad_norm": 0.8848808407783508, + "learning_rate": 4.261910672579916e-06, + "loss": 0.6654, + "step": 4154 + }, + { + "epoch": 1.5338419886783166, + "grad_norm": 0.8804117441177368, + "learning_rate": 4.26156558468212e-06, + "loss": 0.6696, + "step": 4155 + }, + { + "epoch": 1.5342111740093527, + "grad_norm": 0.8797795176506042, + "learning_rate": 4.261220430109622e-06, + "loss": 0.6364, + "step": 4156 + }, + { + "epoch": 1.5345803593403888, + "grad_norm": 0.8838468194007874, + "learning_rate": 4.2608752088754865e-06, + "loss": 0.6501, + "step": 4157 + }, + { + "epoch": 1.534949544671425, + "grad_norm": 0.8760209083557129, + "learning_rate": 4.2605299209927804e-06, + "loss": 0.6799, + "step": 4158 + }, + { + "epoch": 1.5353187300024613, + "grad_norm": 0.8833801746368408, + "learning_rate": 4.260184566474572e-06, + "loss": 0.6588, + "step": 4159 + }, + { + "epoch": 1.5356879153334975, + "grad_norm": 0.8775025010108948, + "learning_rate": 4.259839145333933e-06, + "loss": 0.6488, + "step": 4160 + }, + { + "epoch": 1.5360571006645336, + "grad_norm": 0.8607179522514343, + "learning_rate": 4.259493657583938e-06, + "loss": 0.6943, + "step": 4161 + }, + { + "epoch": 1.5364262859955697, + "grad_norm": 0.8753871321678162, + "learning_rate": 4.259148103237663e-06, + "loss": 0.6663, + "step": 4162 + }, + { + "epoch": 1.5367954713266059, + "grad_norm": 0.8722121119499207, + "learning_rate": 4.258802482308189e-06, + "loss": 0.6718, + "step": 4163 + }, + { + "epoch": 1.5371646566576422, + "grad_norm": 0.8868932723999023, + "learning_rate": 4.258456794808595e-06, + "loss": 0.643, + "step": 4164 + }, + { + "epoch": 1.5375338419886782, + "grad_norm": 0.8564817309379578, + "learning_rate": 4.258111040751966e-06, + "loss": 0.6293, + "step": 4165 + }, + { + "epoch": 1.5379030273197145, + "grad_norm": 0.8446234464645386, + "learning_rate": 4.25776522015139e-06, + "loss": 0.6772, + "step": 4166 + }, + { + "epoch": 1.5382722126507506, + "grad_norm": 0.8906815052032471, + "learning_rate": 4.257419333019954e-06, + "loss": 0.6296, + "step": 4167 + }, + { + "epoch": 1.5386413979817868, + "grad_norm": 0.8602213859558105, + "learning_rate": 4.257073379370752e-06, + "loss": 0.6812, + "step": 4168 + }, + { + "epoch": 1.5390105833128231, + "grad_norm": 0.8777429461479187, + "learning_rate": 4.256727359216876e-06, + "loss": 0.6625, + "step": 4169 + }, + { + "epoch": 1.539379768643859, + "grad_norm": 0.8831945657730103, + "learning_rate": 4.256381272571425e-06, + "loss": 0.6609, + "step": 4170 + }, + { + "epoch": 1.5397489539748954, + "grad_norm": 0.8777225017547607, + "learning_rate": 4.256035119447496e-06, + "loss": 0.6733, + "step": 4171 + }, + { + "epoch": 1.5401181393059316, + "grad_norm": 0.8665995001792908, + "learning_rate": 4.255688899858193e-06, + "loss": 0.6584, + "step": 4172 + }, + { + "epoch": 1.5404873246369677, + "grad_norm": 0.8725627660751343, + "learning_rate": 4.255342613816619e-06, + "loss": 0.6449, + "step": 4173 + }, + { + "epoch": 1.540856509968004, + "grad_norm": 0.8619717359542847, + "learning_rate": 4.2549962613358806e-06, + "loss": 0.6907, + "step": 4174 + }, + { + "epoch": 1.54122569529904, + "grad_norm": 0.8815796375274658, + "learning_rate": 4.254649842429088e-06, + "loss": 0.6925, + "step": 4175 + }, + { + "epoch": 1.5415948806300763, + "grad_norm": 0.879122793674469, + "learning_rate": 4.254303357109353e-06, + "loss": 0.6676, + "step": 4176 + }, + { + "epoch": 1.5419640659611125, + "grad_norm": 0.8658646941184998, + "learning_rate": 4.253956805389789e-06, + "loss": 0.635, + "step": 4177 + }, + { + "epoch": 1.5423332512921486, + "grad_norm": 0.834750235080719, + "learning_rate": 4.253610187283513e-06, + "loss": 0.6331, + "step": 4178 + }, + { + "epoch": 1.542702436623185, + "grad_norm": 0.8679867386817932, + "learning_rate": 4.253263502803646e-06, + "loss": 0.638, + "step": 4179 + }, + { + "epoch": 1.5430716219542209, + "grad_norm": 0.895286500453949, + "learning_rate": 4.2529167519633076e-06, + "loss": 0.6674, + "step": 4180 + }, + { + "epoch": 1.5434408072852572, + "grad_norm": 0.9041076302528381, + "learning_rate": 4.252569934775624e-06, + "loss": 0.6653, + "step": 4181 + }, + { + "epoch": 1.5438099926162934, + "grad_norm": 0.8738065958023071, + "learning_rate": 4.252223051253722e-06, + "loss": 0.6713, + "step": 4182 + }, + { + "epoch": 1.5441791779473295, + "grad_norm": 0.9097195267677307, + "learning_rate": 4.25187610141073e-06, + "loss": 0.6521, + "step": 4183 + }, + { + "epoch": 1.5445483632783659, + "grad_norm": 0.8729567527770996, + "learning_rate": 4.25152908525978e-06, + "loss": 0.6639, + "step": 4184 + }, + { + "epoch": 1.5449175486094018, + "grad_norm": 0.8507500886917114, + "learning_rate": 4.251182002814007e-06, + "loss": 0.6715, + "step": 4185 + }, + { + "epoch": 1.5452867339404381, + "grad_norm": 0.8707184195518494, + "learning_rate": 4.250834854086549e-06, + "loss": 0.6877, + "step": 4186 + }, + { + "epoch": 1.5456559192714743, + "grad_norm": 0.8758911490440369, + "learning_rate": 4.250487639090543e-06, + "loss": 0.6407, + "step": 4187 + }, + { + "epoch": 1.5460251046025104, + "grad_norm": 0.8470941185951233, + "learning_rate": 4.2501403578391335e-06, + "loss": 0.6497, + "step": 4188 + }, + { + "epoch": 1.5463942899335468, + "grad_norm": 0.8853554725646973, + "learning_rate": 4.2497930103454635e-06, + "loss": 0.6578, + "step": 4189 + }, + { + "epoch": 1.5467634752645827, + "grad_norm": 0.8749484419822693, + "learning_rate": 4.2494455966226795e-06, + "loss": 0.6328, + "step": 4190 + }, + { + "epoch": 1.547132660595619, + "grad_norm": 0.8733835220336914, + "learning_rate": 4.2490981166839326e-06, + "loss": 0.6476, + "step": 4191 + }, + { + "epoch": 1.5475018459266552, + "grad_norm": 0.8741841912269592, + "learning_rate": 4.248750570542373e-06, + "loss": 0.668, + "step": 4192 + }, + { + "epoch": 1.5478710312576913, + "grad_norm": 0.8674169778823853, + "learning_rate": 4.2484029582111575e-06, + "loss": 0.6391, + "step": 4193 + }, + { + "epoch": 1.5482402165887277, + "grad_norm": 0.8677988052368164, + "learning_rate": 4.2480552797034415e-06, + "loss": 0.675, + "step": 4194 + }, + { + "epoch": 1.5486094019197636, + "grad_norm": 0.8665788173675537, + "learning_rate": 4.2477075350323845e-06, + "loss": 0.6413, + "step": 4195 + }, + { + "epoch": 1.5489785872508, + "grad_norm": 0.8570698499679565, + "learning_rate": 4.247359724211149e-06, + "loss": 0.6606, + "step": 4196 + }, + { + "epoch": 1.549347772581836, + "grad_norm": 0.8664868474006653, + "learning_rate": 4.247011847252899e-06, + "loss": 0.6799, + "step": 4197 + }, + { + "epoch": 1.5497169579128722, + "grad_norm": 0.8738890290260315, + "learning_rate": 4.246663904170802e-06, + "loss": 0.6705, + "step": 4198 + }, + { + "epoch": 1.5500861432439086, + "grad_norm": 0.8800649046897888, + "learning_rate": 4.246315894978028e-06, + "loss": 0.6357, + "step": 4199 + }, + { + "epoch": 1.5504553285749445, + "grad_norm": 0.8397420644760132, + "learning_rate": 4.245967819687748e-06, + "loss": 0.6538, + "step": 4200 + }, + { + "epoch": 1.5508245139059809, + "grad_norm": 0.8584029078483582, + "learning_rate": 4.245619678313138e-06, + "loss": 0.6534, + "step": 4201 + }, + { + "epoch": 1.551193699237017, + "grad_norm": 0.8632591366767883, + "learning_rate": 4.2452714708673735e-06, + "loss": 0.676, + "step": 4202 + }, + { + "epoch": 1.5515628845680531, + "grad_norm": 0.8554303646087646, + "learning_rate": 4.2449231973636355e-06, + "loss": 0.6978, + "step": 4203 + }, + { + "epoch": 1.5519320698990895, + "grad_norm": 0.8863167762756348, + "learning_rate": 4.244574857815104e-06, + "loss": 0.6678, + "step": 4204 + }, + { + "epoch": 1.5523012552301254, + "grad_norm": 0.877070963382721, + "learning_rate": 4.244226452234965e-06, + "loss": 0.6714, + "step": 4205 + }, + { + "epoch": 1.5526704405611618, + "grad_norm": 0.8823643922805786, + "learning_rate": 4.243877980636405e-06, + "loss": 0.6323, + "step": 4206 + }, + { + "epoch": 1.553039625892198, + "grad_norm": 0.8465237617492676, + "learning_rate": 4.2435294430326145e-06, + "loss": 0.6446, + "step": 4207 + }, + { + "epoch": 1.553408811223234, + "grad_norm": 0.8805181980133057, + "learning_rate": 4.243180839436784e-06, + "loss": 0.6179, + "step": 4208 + }, + { + "epoch": 1.5537779965542704, + "grad_norm": 0.8774869441986084, + "learning_rate": 4.24283216986211e-06, + "loss": 0.7466, + "step": 4209 + }, + { + "epoch": 1.5541471818853063, + "grad_norm": 0.8964880704879761, + "learning_rate": 4.242483434321788e-06, + "loss": 0.6954, + "step": 4210 + }, + { + "epoch": 1.5545163672163427, + "grad_norm": 0.8603606820106506, + "learning_rate": 4.242134632829017e-06, + "loss": 0.6607, + "step": 4211 + }, + { + "epoch": 1.5548855525473788, + "grad_norm": 0.8374791145324707, + "learning_rate": 4.241785765397001e-06, + "loss": 0.616, + "step": 4212 + }, + { + "epoch": 1.555254737878415, + "grad_norm": 0.8593462109565735, + "learning_rate": 4.2414368320389436e-06, + "loss": 0.6563, + "step": 4213 + }, + { + "epoch": 1.5556239232094513, + "grad_norm": 0.8668380975723267, + "learning_rate": 4.241087832768051e-06, + "loss": 0.6585, + "step": 4214 + }, + { + "epoch": 1.5559931085404872, + "grad_norm": 5.635787487030029, + "learning_rate": 4.240738767597534e-06, + "loss": 0.6642, + "step": 4215 + }, + { + "epoch": 1.5563622938715236, + "grad_norm": 0.8751420974731445, + "learning_rate": 4.240389636540604e-06, + "loss": 0.6991, + "step": 4216 + }, + { + "epoch": 1.5567314792025597, + "grad_norm": 0.861203670501709, + "learning_rate": 4.240040439610475e-06, + "loss": 0.6464, + "step": 4217 + }, + { + "epoch": 1.5571006645335959, + "grad_norm": 0.8804810047149658, + "learning_rate": 4.239691176820365e-06, + "loss": 0.6438, + "step": 4218 + }, + { + "epoch": 1.557469849864632, + "grad_norm": 0.8419449329376221, + "learning_rate": 4.239341848183492e-06, + "loss": 0.675, + "step": 4219 + }, + { + "epoch": 1.5578390351956681, + "grad_norm": 0.8441848158836365, + "learning_rate": 4.2389924537130815e-06, + "loss": 0.6335, + "step": 4220 + }, + { + "epoch": 1.5582082205267045, + "grad_norm": 0.8811962008476257, + "learning_rate": 4.238642993422353e-06, + "loss": 0.6648, + "step": 4221 + }, + { + "epoch": 1.5585774058577406, + "grad_norm": 0.8387631773948669, + "learning_rate": 4.238293467324537e-06, + "loss": 0.6651, + "step": 4222 + }, + { + "epoch": 1.5589465911887768, + "grad_norm": 0.898918628692627, + "learning_rate": 4.2379438754328615e-06, + "loss": 0.6375, + "step": 4223 + }, + { + "epoch": 1.559315776519813, + "grad_norm": 0.8734453320503235, + "learning_rate": 4.237594217760559e-06, + "loss": 0.6701, + "step": 4224 + }, + { + "epoch": 1.559684961850849, + "grad_norm": 0.8830417394638062, + "learning_rate": 4.237244494320863e-06, + "loss": 0.6689, + "step": 4225 + }, + { + "epoch": 1.5600541471818854, + "grad_norm": 0.8587502241134644, + "learning_rate": 4.236894705127012e-06, + "loss": 0.6482, + "step": 4226 + }, + { + "epoch": 1.5604233325129215, + "grad_norm": 0.9042527675628662, + "learning_rate": 4.236544850192245e-06, + "loss": 0.6165, + "step": 4227 + }, + { + "epoch": 1.5607925178439577, + "grad_norm": 0.8311232328414917, + "learning_rate": 4.236194929529803e-06, + "loss": 0.6686, + "step": 4228 + }, + { + "epoch": 1.5611617031749938, + "grad_norm": 0.8604593873023987, + "learning_rate": 4.235844943152931e-06, + "loss": 0.6756, + "step": 4229 + }, + { + "epoch": 1.56153088850603, + "grad_norm": 0.9056840538978577, + "learning_rate": 4.235494891074875e-06, + "loss": 0.62, + "step": 4230 + }, + { + "epoch": 1.5619000738370663, + "grad_norm": 0.8579649925231934, + "learning_rate": 4.235144773308886e-06, + "loss": 0.6567, + "step": 4231 + }, + { + "epoch": 1.5622692591681024, + "grad_norm": 0.8643840551376343, + "learning_rate": 4.234794589868214e-06, + "loss": 0.6549, + "step": 4232 + }, + { + "epoch": 1.5626384444991386, + "grad_norm": 0.9008446335792542, + "learning_rate": 4.234444340766115e-06, + "loss": 0.6661, + "step": 4233 + }, + { + "epoch": 1.5630076298301747, + "grad_norm": 0.8837755918502808, + "learning_rate": 4.234094026015845e-06, + "loss": 0.6837, + "step": 4234 + }, + { + "epoch": 1.5633768151612109, + "grad_norm": 0.8758400678634644, + "learning_rate": 4.233743645630664e-06, + "loss": 0.6365, + "step": 4235 + }, + { + "epoch": 1.5637460004922472, + "grad_norm": 0.8377050161361694, + "learning_rate": 4.233393199623832e-06, + "loss": 0.6842, + "step": 4236 + }, + { + "epoch": 1.5641151858232831, + "grad_norm": 0.884593665599823, + "learning_rate": 4.233042688008615e-06, + "loss": 0.6761, + "step": 4237 + }, + { + "epoch": 1.5644843711543195, + "grad_norm": 0.9407088756561279, + "learning_rate": 4.23269211079828e-06, + "loss": 0.6251, + "step": 4238 + }, + { + "epoch": 1.5648535564853556, + "grad_norm": 0.8532242774963379, + "learning_rate": 4.232341468006095e-06, + "loss": 0.6532, + "step": 4239 + }, + { + "epoch": 1.5652227418163918, + "grad_norm": 0.8671936988830566, + "learning_rate": 4.231990759645331e-06, + "loss": 0.6332, + "step": 4240 + }, + { + "epoch": 1.5655919271474281, + "grad_norm": 0.8622744679450989, + "learning_rate": 4.231639985729265e-06, + "loss": 0.6391, + "step": 4241 + }, + { + "epoch": 1.565961112478464, + "grad_norm": 0.8656474351882935, + "learning_rate": 4.231289146271171e-06, + "loss": 0.6501, + "step": 4242 + }, + { + "epoch": 1.5663302978095004, + "grad_norm": 1.0257892608642578, + "learning_rate": 4.230938241284329e-06, + "loss": 0.6722, + "step": 4243 + }, + { + "epoch": 1.5666994831405365, + "grad_norm": 0.8441953659057617, + "learning_rate": 4.230587270782022e-06, + "loss": 0.6743, + "step": 4244 + }, + { + "epoch": 1.5670686684715727, + "grad_norm": 0.8657674193382263, + "learning_rate": 4.230236234777532e-06, + "loss": 0.6778, + "step": 4245 + }, + { + "epoch": 1.567437853802609, + "grad_norm": 0.849566638469696, + "learning_rate": 4.229885133284147e-06, + "loss": 0.672, + "step": 4246 + }, + { + "epoch": 1.567807039133645, + "grad_norm": 0.884539008140564, + "learning_rate": 4.229533966315156e-06, + "loss": 0.6541, + "step": 4247 + }, + { + "epoch": 1.5681762244646813, + "grad_norm": 0.8398969173431396, + "learning_rate": 4.22918273388385e-06, + "loss": 0.6447, + "step": 4248 + }, + { + "epoch": 1.5685454097957174, + "grad_norm": 0.893461287021637, + "learning_rate": 4.228831436003522e-06, + "loss": 0.6543, + "step": 4249 + }, + { + "epoch": 1.5689145951267536, + "grad_norm": 0.8948325514793396, + "learning_rate": 4.2284800726874705e-06, + "loss": 0.6729, + "step": 4250 + }, + { + "epoch": 1.56928378045779, + "grad_norm": 0.8415946960449219, + "learning_rate": 4.228128643948995e-06, + "loss": 0.6806, + "step": 4251 + }, + { + "epoch": 1.5696529657888258, + "grad_norm": 0.8580443263053894, + "learning_rate": 4.227777149801394e-06, + "loss": 0.6727, + "step": 4252 + }, + { + "epoch": 1.5700221511198622, + "grad_norm": 0.8802024722099304, + "learning_rate": 4.227425590257975e-06, + "loss": 0.6541, + "step": 4253 + }, + { + "epoch": 1.5703913364508983, + "grad_norm": 0.853466272354126, + "learning_rate": 4.227073965332041e-06, + "loss": 0.6616, + "step": 4254 + }, + { + "epoch": 1.5707605217819345, + "grad_norm": 0.8916603326797485, + "learning_rate": 4.226722275036904e-06, + "loss": 0.6725, + "step": 4255 + }, + { + "epoch": 1.5711297071129708, + "grad_norm": 0.8652102947235107, + "learning_rate": 4.226370519385873e-06, + "loss": 0.672, + "step": 4256 + }, + { + "epoch": 1.5714988924440068, + "grad_norm": 0.8541145324707031, + "learning_rate": 4.226018698392262e-06, + "loss": 0.6489, + "step": 4257 + }, + { + "epoch": 1.5718680777750431, + "grad_norm": 0.9183180332183838, + "learning_rate": 4.225666812069389e-06, + "loss": 0.659, + "step": 4258 + }, + { + "epoch": 1.5722372631060793, + "grad_norm": 0.8452380299568176, + "learning_rate": 4.2253148604305715e-06, + "loss": 0.616, + "step": 4259 + }, + { + "epoch": 1.5726064484371154, + "grad_norm": 0.8150725960731506, + "learning_rate": 4.224962843489131e-06, + "loss": 0.6278, + "step": 4260 + }, + { + "epoch": 1.5729756337681517, + "grad_norm": 0.8685572147369385, + "learning_rate": 4.2246107612583905e-06, + "loss": 0.6455, + "step": 4261 + }, + { + "epoch": 1.5733448190991877, + "grad_norm": 0.879389762878418, + "learning_rate": 4.224258613751678e-06, + "loss": 0.6625, + "step": 4262 + }, + { + "epoch": 1.573714004430224, + "grad_norm": 0.8555986285209656, + "learning_rate": 4.223906400982322e-06, + "loss": 0.7186, + "step": 4263 + }, + { + "epoch": 1.5740831897612602, + "grad_norm": 0.8891647458076477, + "learning_rate": 4.2235541229636515e-06, + "loss": 0.6683, + "step": 4264 + }, + { + "epoch": 1.5744523750922963, + "grad_norm": 0.8505069017410278, + "learning_rate": 4.223201779709002e-06, + "loss": 0.658, + "step": 4265 + }, + { + "epoch": 1.5748215604233327, + "grad_norm": 0.8588441610336304, + "learning_rate": 4.222849371231708e-06, + "loss": 0.6254, + "step": 4266 + }, + { + "epoch": 1.5751907457543686, + "grad_norm": 0.8751015663146973, + "learning_rate": 4.22249689754511e-06, + "loss": 0.6863, + "step": 4267 + }, + { + "epoch": 1.575559931085405, + "grad_norm": 0.8835867047309875, + "learning_rate": 4.2221443586625474e-06, + "loss": 0.6375, + "step": 4268 + }, + { + "epoch": 1.575929116416441, + "grad_norm": 0.870347261428833, + "learning_rate": 4.221791754597364e-06, + "loss": 0.6313, + "step": 4269 + }, + { + "epoch": 1.5762983017474772, + "grad_norm": 0.8764151930809021, + "learning_rate": 4.221439085362907e-06, + "loss": 0.6308, + "step": 4270 + }, + { + "epoch": 1.5766674870785136, + "grad_norm": 0.8521614074707031, + "learning_rate": 4.221086350972524e-06, + "loss": 0.6859, + "step": 4271 + }, + { + "epoch": 1.5770366724095495, + "grad_norm": 0.871642529964447, + "learning_rate": 4.2207335514395656e-06, + "loss": 0.6897, + "step": 4272 + }, + { + "epoch": 1.5774058577405858, + "grad_norm": 0.8739179968833923, + "learning_rate": 4.220380686777385e-06, + "loss": 0.6818, + "step": 4273 + }, + { + "epoch": 1.577775043071622, + "grad_norm": 0.873530387878418, + "learning_rate": 4.2200277569993385e-06, + "loss": 0.6584, + "step": 4274 + }, + { + "epoch": 1.5781442284026581, + "grad_norm": 0.8790942430496216, + "learning_rate": 4.219674762118785e-06, + "loss": 0.6428, + "step": 4275 + }, + { + "epoch": 1.5785134137336945, + "grad_norm": 0.8489995002746582, + "learning_rate": 4.2193217021490835e-06, + "loss": 0.6409, + "step": 4276 + }, + { + "epoch": 1.5788825990647304, + "grad_norm": 0.871968686580658, + "learning_rate": 4.2189685771035985e-06, + "loss": 0.7051, + "step": 4277 + }, + { + "epoch": 1.5792517843957667, + "grad_norm": 0.8382906317710876, + "learning_rate": 4.218615386995695e-06, + "loss": 0.6493, + "step": 4278 + }, + { + "epoch": 1.5796209697268029, + "grad_norm": 0.861322283744812, + "learning_rate": 4.218262131838742e-06, + "loss": 0.6618, + "step": 4279 + }, + { + "epoch": 1.579990155057839, + "grad_norm": 0.8830376863479614, + "learning_rate": 4.21790881164611e-06, + "loss": 0.6639, + "step": 4280 + }, + { + "epoch": 1.5803593403888754, + "grad_norm": 0.9060579538345337, + "learning_rate": 4.217555426431171e-06, + "loss": 0.6829, + "step": 4281 + }, + { + "epoch": 1.5807285257199113, + "grad_norm": 0.8879520893096924, + "learning_rate": 4.217201976207302e-06, + "loss": 0.7002, + "step": 4282 + }, + { + "epoch": 1.5810977110509477, + "grad_norm": 0.8755478262901306, + "learning_rate": 4.216848460987879e-06, + "loss": 0.6459, + "step": 4283 + }, + { + "epoch": 1.5814668963819838, + "grad_norm": 0.86644446849823, + "learning_rate": 4.216494880786284e-06, + "loss": 0.6503, + "step": 4284 + }, + { + "epoch": 1.58183608171302, + "grad_norm": 0.8244695663452148, + "learning_rate": 4.2161412356159e-06, + "loss": 0.6778, + "step": 4285 + }, + { + "epoch": 1.582205267044056, + "grad_norm": 0.8316506743431091, + "learning_rate": 4.215787525490111e-06, + "loss": 0.6408, + "step": 4286 + }, + { + "epoch": 1.5825744523750922, + "grad_norm": 0.8701819181442261, + "learning_rate": 4.215433750422306e-06, + "loss": 0.6787, + "step": 4287 + }, + { + "epoch": 1.5829436377061286, + "grad_norm": 0.8805943131446838, + "learning_rate": 4.215079910425875e-06, + "loss": 0.7311, + "step": 4288 + }, + { + "epoch": 1.5833128230371647, + "grad_norm": 0.8949970006942749, + "learning_rate": 4.214726005514211e-06, + "loss": 0.694, + "step": 4289 + }, + { + "epoch": 1.5836820083682008, + "grad_norm": 0.8912908434867859, + "learning_rate": 4.214372035700708e-06, + "loss": 0.6945, + "step": 4290 + }, + { + "epoch": 1.584051193699237, + "grad_norm": 0.9008890986442566, + "learning_rate": 4.214018000998765e-06, + "loss": 0.6732, + "step": 4291 + }, + { + "epoch": 1.584420379030273, + "grad_norm": 0.8636583685874939, + "learning_rate": 4.213663901421782e-06, + "loss": 0.6324, + "step": 4292 + }, + { + "epoch": 1.5847895643613095, + "grad_norm": 0.8475031852722168, + "learning_rate": 4.213309736983161e-06, + "loss": 0.6883, + "step": 4293 + }, + { + "epoch": 1.5851587496923456, + "grad_norm": 0.8843541145324707, + "learning_rate": 4.212955507696307e-06, + "loss": 0.6377, + "step": 4294 + }, + { + "epoch": 1.5855279350233817, + "grad_norm": 0.8340337872505188, + "learning_rate": 4.212601213574627e-06, + "loss": 0.6416, + "step": 4295 + }, + { + "epoch": 1.5858971203544179, + "grad_norm": 0.8323838114738464, + "learning_rate": 4.212246854631533e-06, + "loss": 0.6508, + "step": 4296 + }, + { + "epoch": 1.586266305685454, + "grad_norm": 0.8633334636688232, + "learning_rate": 4.211892430880436e-06, + "loss": 0.6754, + "step": 4297 + }, + { + "epoch": 1.5866354910164904, + "grad_norm": 0.8896973133087158, + "learning_rate": 4.211537942334749e-06, + "loss": 0.6748, + "step": 4298 + }, + { + "epoch": 1.5870046763475265, + "grad_norm": 0.8369522094726562, + "learning_rate": 4.211183389007892e-06, + "loss": 0.6277, + "step": 4299 + }, + { + "epoch": 1.5873738616785626, + "grad_norm": 0.845987856388092, + "learning_rate": 4.210828770913285e-06, + "loss": 0.6331, + "step": 4300 + }, + { + "epoch": 1.5877430470095988, + "grad_norm": 0.8744791150093079, + "learning_rate": 4.210474088064347e-06, + "loss": 0.6226, + "step": 4301 + }, + { + "epoch": 1.588112232340635, + "grad_norm": 0.8387669920921326, + "learning_rate": 4.210119340474505e-06, + "loss": 0.6426, + "step": 4302 + }, + { + "epoch": 1.5884814176716713, + "grad_norm": 0.8427000641822815, + "learning_rate": 4.209764528157187e-06, + "loss": 0.6618, + "step": 4303 + }, + { + "epoch": 1.5888506030027072, + "grad_norm": 0.8573931455612183, + "learning_rate": 4.20940965112582e-06, + "loss": 0.6573, + "step": 4304 + }, + { + "epoch": 1.5892197883337436, + "grad_norm": 0.8822378516197205, + "learning_rate": 4.209054709393838e-06, + "loss": 0.6779, + "step": 4305 + }, + { + "epoch": 1.5895889736647797, + "grad_norm": 0.8746011257171631, + "learning_rate": 4.208699702974673e-06, + "loss": 0.6454, + "step": 4306 + }, + { + "epoch": 1.5899581589958158, + "grad_norm": 0.8783186078071594, + "learning_rate": 4.208344631881765e-06, + "loss": 0.6296, + "step": 4307 + }, + { + "epoch": 1.5903273443268522, + "grad_norm": 0.8214596509933472, + "learning_rate": 4.207989496128552e-06, + "loss": 0.6493, + "step": 4308 + }, + { + "epoch": 1.590696529657888, + "grad_norm": 0.8744916915893555, + "learning_rate": 4.207634295728475e-06, + "loss": 0.6415, + "step": 4309 + }, + { + "epoch": 1.5910657149889245, + "grad_norm": 0.8120558261871338, + "learning_rate": 4.207279030694979e-06, + "loss": 0.6314, + "step": 4310 + }, + { + "epoch": 1.5914349003199606, + "grad_norm": 0.8672664761543274, + "learning_rate": 4.206923701041511e-06, + "loss": 0.6633, + "step": 4311 + }, + { + "epoch": 1.5918040856509967, + "grad_norm": 0.8859631419181824, + "learning_rate": 4.206568306781519e-06, + "loss": 0.693, + "step": 4312 + }, + { + "epoch": 1.592173270982033, + "grad_norm": 0.887881875038147, + "learning_rate": 4.206212847928455e-06, + "loss": 0.6981, + "step": 4313 + }, + { + "epoch": 1.592542456313069, + "grad_norm": 0.9100503921508789, + "learning_rate": 4.205857324495774e-06, + "loss": 0.7014, + "step": 4314 + }, + { + "epoch": 1.5929116416441054, + "grad_norm": 0.8759715557098389, + "learning_rate": 4.205501736496931e-06, + "loss": 0.6562, + "step": 4315 + }, + { + "epoch": 1.5932808269751415, + "grad_norm": 0.8819805383682251, + "learning_rate": 4.2051460839453855e-06, + "loss": 0.6903, + "step": 4316 + }, + { + "epoch": 1.5936500123061776, + "grad_norm": 0.8690140843391418, + "learning_rate": 4.2047903668545994e-06, + "loss": 0.645, + "step": 4317 + }, + { + "epoch": 1.594019197637214, + "grad_norm": 0.8905983567237854, + "learning_rate": 4.204434585238035e-06, + "loss": 0.6411, + "step": 4318 + }, + { + "epoch": 1.59438838296825, + "grad_norm": 0.8634137511253357, + "learning_rate": 4.204078739109161e-06, + "loss": 0.6728, + "step": 4319 + }, + { + "epoch": 1.5947575682992863, + "grad_norm": 0.8836909532546997, + "learning_rate": 4.203722828481443e-06, + "loss": 0.6436, + "step": 4320 + }, + { + "epoch": 1.5951267536303224, + "grad_norm": 0.8512603640556335, + "learning_rate": 4.2033668533683545e-06, + "loss": 0.6573, + "step": 4321 + }, + { + "epoch": 1.5954959389613586, + "grad_norm": 0.8892917037010193, + "learning_rate": 4.203010813783368e-06, + "loss": 0.652, + "step": 4322 + }, + { + "epoch": 1.595865124292395, + "grad_norm": 0.8665977716445923, + "learning_rate": 4.20265470973996e-06, + "loss": 0.658, + "step": 4323 + }, + { + "epoch": 1.5962343096234308, + "grad_norm": 0.8783267140388489, + "learning_rate": 4.202298541251608e-06, + "loss": 0.6714, + "step": 4324 + }, + { + "epoch": 1.5966034949544672, + "grad_norm": 0.9353106617927551, + "learning_rate": 4.201942308331793e-06, + "loss": 0.6816, + "step": 4325 + }, + { + "epoch": 1.5969726802855033, + "grad_norm": 0.8760313987731934, + "learning_rate": 4.2015860109939995e-06, + "loss": 0.6718, + "step": 4326 + }, + { + "epoch": 1.5973418656165395, + "grad_norm": 0.9170013666152954, + "learning_rate": 4.201229649251714e-06, + "loss": 0.6554, + "step": 4327 + }, + { + "epoch": 1.5977110509475758, + "grad_norm": 0.8757601976394653, + "learning_rate": 4.200873223118421e-06, + "loss": 0.7011, + "step": 4328 + }, + { + "epoch": 1.5980802362786117, + "grad_norm": 0.8628302216529846, + "learning_rate": 4.200516732607614e-06, + "loss": 0.6884, + "step": 4329 + }, + { + "epoch": 1.598449421609648, + "grad_norm": 0.8542652726173401, + "learning_rate": 4.200160177732785e-06, + "loss": 0.6466, + "step": 4330 + }, + { + "epoch": 1.5988186069406842, + "grad_norm": 0.8923056721687317, + "learning_rate": 4.199803558507431e-06, + "loss": 0.665, + "step": 4331 + }, + { + "epoch": 1.5991877922717204, + "grad_norm": 0.8479766845703125, + "learning_rate": 4.199446874945048e-06, + "loss": 0.6562, + "step": 4332 + }, + { + "epoch": 1.5995569776027567, + "grad_norm": 0.8462057709693909, + "learning_rate": 4.199090127059137e-06, + "loss": 0.6493, + "step": 4333 + }, + { + "epoch": 1.5999261629337926, + "grad_norm": 0.8644058108329773, + "learning_rate": 4.198733314863201e-06, + "loss": 0.6826, + "step": 4334 + }, + { + "epoch": 1.600295348264829, + "grad_norm": 0.8548464179039001, + "learning_rate": 4.198376438370745e-06, + "loss": 0.669, + "step": 4335 + }, + { + "epoch": 1.6006645335958651, + "grad_norm": 0.8514336943626404, + "learning_rate": 4.198019497595277e-06, + "loss": 0.6515, + "step": 4336 + }, + { + "epoch": 1.6010337189269013, + "grad_norm": 0.8964693546295166, + "learning_rate": 4.1976624925503075e-06, + "loss": 0.6494, + "step": 4337 + }, + { + "epoch": 1.6014029042579376, + "grad_norm": 0.8826411962509155, + "learning_rate": 4.197305423249347e-06, + "loss": 0.6481, + "step": 4338 + }, + { + "epoch": 1.6017720895889735, + "grad_norm": 0.889150083065033, + "learning_rate": 4.196948289705914e-06, + "loss": 0.6578, + "step": 4339 + }, + { + "epoch": 1.60214127492001, + "grad_norm": 0.8682560324668884, + "learning_rate": 4.1965910919335215e-06, + "loss": 0.6122, + "step": 4340 + }, + { + "epoch": 1.602510460251046, + "grad_norm": 0.8223267793655396, + "learning_rate": 4.196233829945693e-06, + "loss": 0.6434, + "step": 4341 + }, + { + "epoch": 1.6028796455820822, + "grad_norm": 0.8587920665740967, + "learning_rate": 4.195876503755949e-06, + "loss": 0.6542, + "step": 4342 + }, + { + "epoch": 1.6032488309131185, + "grad_norm": 0.8731874823570251, + "learning_rate": 4.195519113377815e-06, + "loss": 0.6692, + "step": 4343 + }, + { + "epoch": 1.6036180162441545, + "grad_norm": 0.9026461243629456, + "learning_rate": 4.1951616588248175e-06, + "loss": 0.6311, + "step": 4344 + }, + { + "epoch": 1.6039872015751908, + "grad_norm": 0.8386043310165405, + "learning_rate": 4.1948041401104855e-06, + "loss": 0.6313, + "step": 4345 + }, + { + "epoch": 1.604356386906227, + "grad_norm": 0.8703342080116272, + "learning_rate": 4.194446557248353e-06, + "loss": 0.643, + "step": 4346 + }, + { + "epoch": 1.604725572237263, + "grad_norm": 0.8721659183502197, + "learning_rate": 4.1940889102519525e-06, + "loss": 0.6591, + "step": 4347 + }, + { + "epoch": 1.6050947575682994, + "grad_norm": 0.895194947719574, + "learning_rate": 4.193731199134821e-06, + "loss": 0.6577, + "step": 4348 + }, + { + "epoch": 1.6054639428993354, + "grad_norm": 0.8768149018287659, + "learning_rate": 4.193373423910499e-06, + "loss": 0.672, + "step": 4349 + }, + { + "epoch": 1.6058331282303717, + "grad_norm": 0.8727839589118958, + "learning_rate": 4.193015584592526e-06, + "loss": 0.662, + "step": 4350 + }, + { + "epoch": 1.6062023135614079, + "grad_norm": 0.8923423886299133, + "learning_rate": 4.192657681194449e-06, + "loss": 0.6837, + "step": 4351 + }, + { + "epoch": 1.606571498892444, + "grad_norm": 0.8899692296981812, + "learning_rate": 4.192299713729812e-06, + "loss": 0.6547, + "step": 4352 + }, + { + "epoch": 1.6069406842234804, + "grad_norm": 0.8503372669219971, + "learning_rate": 4.191941682212165e-06, + "loss": 0.6346, + "step": 4353 + }, + { + "epoch": 1.6073098695545163, + "grad_norm": 0.8508785963058472, + "learning_rate": 4.191583586655059e-06, + "loss": 0.6823, + "step": 4354 + }, + { + "epoch": 1.6076790548855526, + "grad_norm": 0.8549250960350037, + "learning_rate": 4.191225427072048e-06, + "loss": 0.6934, + "step": 4355 + }, + { + "epoch": 1.6080482402165888, + "grad_norm": 0.8571179509162903, + "learning_rate": 4.1908672034766895e-06, + "loss": 0.6819, + "step": 4356 + }, + { + "epoch": 1.608417425547625, + "grad_norm": 0.843822717666626, + "learning_rate": 4.190508915882541e-06, + "loss": 0.6333, + "step": 4357 + }, + { + "epoch": 1.608786610878661, + "grad_norm": 0.861970841884613, + "learning_rate": 4.190150564303162e-06, + "loss": 0.6511, + "step": 4358 + }, + { + "epoch": 1.6091557962096972, + "grad_norm": 0.9087755084037781, + "learning_rate": 4.189792148752119e-06, + "loss": 0.6391, + "step": 4359 + }, + { + "epoch": 1.6095249815407335, + "grad_norm": 0.851084291934967, + "learning_rate": 4.189433669242976e-06, + "loss": 0.6312, + "step": 4360 + }, + { + "epoch": 1.6098941668717697, + "grad_norm": 0.8654747009277344, + "learning_rate": 4.1890751257893014e-06, + "loss": 0.6331, + "step": 4361 + }, + { + "epoch": 1.6102633522028058, + "grad_norm": 0.8624576926231384, + "learning_rate": 4.188716518404668e-06, + "loss": 0.6444, + "step": 4362 + }, + { + "epoch": 1.610632537533842, + "grad_norm": 0.8917809128761292, + "learning_rate": 4.188357847102645e-06, + "loss": 0.6361, + "step": 4363 + }, + { + "epoch": 1.611001722864878, + "grad_norm": 0.8862797617912292, + "learning_rate": 4.187999111896811e-06, + "loss": 0.693, + "step": 4364 + }, + { + "epoch": 1.6113709081959144, + "grad_norm": 0.8763318657875061, + "learning_rate": 4.187640312800745e-06, + "loss": 0.6647, + "step": 4365 + }, + { + "epoch": 1.6117400935269506, + "grad_norm": 0.8834691047668457, + "learning_rate": 4.187281449828024e-06, + "loss": 0.6733, + "step": 4366 + }, + { + "epoch": 1.6121092788579867, + "grad_norm": 0.911658763885498, + "learning_rate": 4.186922522992234e-06, + "loss": 0.6661, + "step": 4367 + }, + { + "epoch": 1.6124784641890229, + "grad_norm": 0.897529661655426, + "learning_rate": 4.186563532306957e-06, + "loss": 0.6415, + "step": 4368 + }, + { + "epoch": 1.612847649520059, + "grad_norm": 0.870319664478302, + "learning_rate": 4.186204477785784e-06, + "loss": 0.6416, + "step": 4369 + }, + { + "epoch": 1.6132168348510953, + "grad_norm": 0.8577300906181335, + "learning_rate": 4.185845359442303e-06, + "loss": 0.6437, + "step": 4370 + }, + { + "epoch": 1.6135860201821313, + "grad_norm": 0.8702265024185181, + "learning_rate": 4.185486177290108e-06, + "loss": 0.6625, + "step": 4371 + }, + { + "epoch": 1.6139552055131676, + "grad_norm": 0.8731698393821716, + "learning_rate": 4.185126931342793e-06, + "loss": 0.63, + "step": 4372 + }, + { + "epoch": 1.6143243908442038, + "grad_norm": 0.8518331050872803, + "learning_rate": 4.184767621613955e-06, + "loss": 0.6719, + "step": 4373 + }, + { + "epoch": 1.61469357617524, + "grad_norm": 0.887959361076355, + "learning_rate": 4.184408248117194e-06, + "loss": 0.653, + "step": 4374 + }, + { + "epoch": 1.6150627615062763, + "grad_norm": 0.8678539395332336, + "learning_rate": 4.184048810866114e-06, + "loss": 0.6059, + "step": 4375 + }, + { + "epoch": 1.6154319468373122, + "grad_norm": 0.8768351078033447, + "learning_rate": 4.183689309874317e-06, + "loss": 0.6474, + "step": 4376 + }, + { + "epoch": 1.6158011321683485, + "grad_norm": 0.8531150817871094, + "learning_rate": 4.18332974515541e-06, + "loss": 0.6287, + "step": 4377 + }, + { + "epoch": 1.6161703174993847, + "grad_norm": 0.9014023542404175, + "learning_rate": 4.1829701167230045e-06, + "loss": 0.6374, + "step": 4378 + }, + { + "epoch": 1.6165395028304208, + "grad_norm": 0.8881750106811523, + "learning_rate": 4.1826104245907116e-06, + "loss": 0.7131, + "step": 4379 + }, + { + "epoch": 1.6169086881614572, + "grad_norm": 0.917102038860321, + "learning_rate": 4.182250668772144e-06, + "loss": 0.6634, + "step": 4380 + }, + { + "epoch": 1.617277873492493, + "grad_norm": 0.8943058848381042, + "learning_rate": 4.181890849280921e-06, + "loss": 0.6764, + "step": 4381 + }, + { + "epoch": 1.6176470588235294, + "grad_norm": 0.8618694543838501, + "learning_rate": 4.181530966130659e-06, + "loss": 0.6176, + "step": 4382 + }, + { + "epoch": 1.6180162441545656, + "grad_norm": 0.8253743648529053, + "learning_rate": 4.181171019334981e-06, + "loss": 0.6659, + "step": 4383 + }, + { + "epoch": 1.6183854294856017, + "grad_norm": 0.8890605568885803, + "learning_rate": 4.18081100890751e-06, + "loss": 0.6606, + "step": 4384 + }, + { + "epoch": 1.618754614816638, + "grad_norm": 0.8632179498672485, + "learning_rate": 4.180450934861874e-06, + "loss": 0.6493, + "step": 4385 + }, + { + "epoch": 1.619123800147674, + "grad_norm": 0.8663358092308044, + "learning_rate": 4.180090797211699e-06, + "loss": 0.6513, + "step": 4386 + }, + { + "epoch": 1.6194929854787103, + "grad_norm": 0.8597458004951477, + "learning_rate": 4.179730595970618e-06, + "loss": 0.658, + "step": 4387 + }, + { + "epoch": 1.6198621708097465, + "grad_norm": 0.8524882197380066, + "learning_rate": 4.1793703311522646e-06, + "loss": 0.6169, + "step": 4388 + }, + { + "epoch": 1.6202313561407826, + "grad_norm": 0.8515695929527283, + "learning_rate": 4.179010002770274e-06, + "loss": 0.6469, + "step": 4389 + }, + { + "epoch": 1.620600541471819, + "grad_norm": 0.8857511878013611, + "learning_rate": 4.178649610838284e-06, + "loss": 0.6715, + "step": 4390 + }, + { + "epoch": 1.620969726802855, + "grad_norm": 0.8813033699989319, + "learning_rate": 4.1782891553699374e-06, + "loss": 0.6432, + "step": 4391 + }, + { + "epoch": 1.6213389121338913, + "grad_norm": 0.8559392094612122, + "learning_rate": 4.177928636378875e-06, + "loss": 0.6517, + "step": 4392 + }, + { + "epoch": 1.6217080974649274, + "grad_norm": 0.9965197443962097, + "learning_rate": 4.177568053878743e-06, + "loss": 0.6448, + "step": 4393 + }, + { + "epoch": 1.6220772827959635, + "grad_norm": 0.876235842704773, + "learning_rate": 4.177207407883191e-06, + "loss": 0.6492, + "step": 4394 + }, + { + "epoch": 1.6224464681269999, + "grad_norm": 0.8454791903495789, + "learning_rate": 4.176846698405867e-06, + "loss": 0.6683, + "step": 4395 + }, + { + "epoch": 1.6228156534580358, + "grad_norm": 0.8627314567565918, + "learning_rate": 4.176485925460424e-06, + "loss": 0.6365, + "step": 4396 + }, + { + "epoch": 1.6231848387890722, + "grad_norm": 0.8540224432945251, + "learning_rate": 4.176125089060518e-06, + "loss": 0.6283, + "step": 4397 + }, + { + "epoch": 1.6235540241201083, + "grad_norm": 0.8493157029151917, + "learning_rate": 4.175764189219808e-06, + "loss": 0.6482, + "step": 4398 + }, + { + "epoch": 1.6239232094511444, + "grad_norm": 0.8832751512527466, + "learning_rate": 4.1754032259519516e-06, + "loss": 0.6398, + "step": 4399 + }, + { + "epoch": 1.6242923947821808, + "grad_norm": 0.8519330620765686, + "learning_rate": 4.175042199270611e-06, + "loss": 0.6487, + "step": 4400 + }, + { + "epoch": 1.6246615801132167, + "grad_norm": 0.8589913249015808, + "learning_rate": 4.1746811091894525e-06, + "loss": 0.6547, + "step": 4401 + }, + { + "epoch": 1.625030765444253, + "grad_norm": 0.8730244636535645, + "learning_rate": 4.174319955722143e-06, + "loss": 0.6388, + "step": 4402 + }, + { + "epoch": 1.6253999507752892, + "grad_norm": 0.9142640233039856, + "learning_rate": 4.17395873888235e-06, + "loss": 0.6895, + "step": 4403 + }, + { + "epoch": 1.6257691361063253, + "grad_norm": 0.8779181838035583, + "learning_rate": 4.173597458683748e-06, + "loss": 0.6472, + "step": 4404 + }, + { + "epoch": 1.6261383214373617, + "grad_norm": 0.8999330997467041, + "learning_rate": 4.173236115140011e-06, + "loss": 0.6374, + "step": 4405 + }, + { + "epoch": 1.6265075067683976, + "grad_norm": 0.8564229011535645, + "learning_rate": 4.172874708264814e-06, + "loss": 0.6497, + "step": 4406 + }, + { + "epoch": 1.626876692099434, + "grad_norm": 0.8747360706329346, + "learning_rate": 4.172513238071838e-06, + "loss": 0.6629, + "step": 4407 + }, + { + "epoch": 1.6272458774304701, + "grad_norm": 0.8672546148300171, + "learning_rate": 4.172151704574763e-06, + "loss": 0.665, + "step": 4408 + }, + { + "epoch": 1.6276150627615062, + "grad_norm": 0.8409205675125122, + "learning_rate": 4.171790107787275e-06, + "loss": 0.6585, + "step": 4409 + }, + { + "epoch": 1.6279842480925426, + "grad_norm": 0.9315345883369446, + "learning_rate": 4.171428447723059e-06, + "loss": 0.6302, + "step": 4410 + }, + { + "epoch": 1.6283534334235785, + "grad_norm": 0.8946703672409058, + "learning_rate": 4.171066724395802e-06, + "loss": 0.6613, + "step": 4411 + }, + { + "epoch": 1.6287226187546149, + "grad_norm": 0.8571600914001465, + "learning_rate": 4.170704937819198e-06, + "loss": 0.628, + "step": 4412 + }, + { + "epoch": 1.629091804085651, + "grad_norm": 0.8596149682998657, + "learning_rate": 4.17034308800694e-06, + "loss": 0.6816, + "step": 4413 + }, + { + "epoch": 1.6294609894166872, + "grad_norm": 0.886307418346405, + "learning_rate": 4.169981174972723e-06, + "loss": 0.6469, + "step": 4414 + }, + { + "epoch": 1.6298301747477235, + "grad_norm": 0.8935983180999756, + "learning_rate": 4.169619198730244e-06, + "loss": 0.6719, + "step": 4415 + }, + { + "epoch": 1.6301993600787594, + "grad_norm": 0.8672124743461609, + "learning_rate": 4.1692571592932066e-06, + "loss": 0.6327, + "step": 4416 + }, + { + "epoch": 1.6305685454097958, + "grad_norm": 0.8690403699874878, + "learning_rate": 4.168895056675313e-06, + "loss": 0.6649, + "step": 4417 + }, + { + "epoch": 1.630937730740832, + "grad_norm": 0.8727594614028931, + "learning_rate": 4.168532890890268e-06, + "loss": 0.6578, + "step": 4418 + }, + { + "epoch": 1.631306916071868, + "grad_norm": 0.8690437078475952, + "learning_rate": 4.168170661951779e-06, + "loss": 0.6775, + "step": 4419 + }, + { + "epoch": 1.6316761014029044, + "grad_norm": 0.8844609260559082, + "learning_rate": 4.167808369873556e-06, + "loss": 0.6716, + "step": 4420 + }, + { + "epoch": 1.6320452867339403, + "grad_norm": 0.8707059621810913, + "learning_rate": 4.167446014669315e-06, + "loss": 0.6618, + "step": 4421 + }, + { + "epoch": 1.6324144720649767, + "grad_norm": 0.827874481678009, + "learning_rate": 4.167083596352767e-06, + "loss": 0.6759, + "step": 4422 + }, + { + "epoch": 1.6327836573960128, + "grad_norm": 0.8631847500801086, + "learning_rate": 4.166721114937632e-06, + "loss": 0.6524, + "step": 4423 + }, + { + "epoch": 1.633152842727049, + "grad_norm": 0.8543108701705933, + "learning_rate": 4.166358570437628e-06, + "loss": 0.633, + "step": 4424 + }, + { + "epoch": 1.633522028058085, + "grad_norm": 0.913436770439148, + "learning_rate": 4.165995962866478e-06, + "loss": 0.6349, + "step": 4425 + }, + { + "epoch": 1.6338912133891212, + "grad_norm": 0.8641530871391296, + "learning_rate": 4.1656332922379075e-06, + "loss": 0.6473, + "step": 4426 + }, + { + "epoch": 1.6342603987201576, + "grad_norm": 0.8774133324623108, + "learning_rate": 4.165270558565642e-06, + "loss": 0.6445, + "step": 4427 + }, + { + "epoch": 1.6346295840511937, + "grad_norm": 1.0843048095703125, + "learning_rate": 4.1649077618634106e-06, + "loss": 0.6353, + "step": 4428 + }, + { + "epoch": 1.6349987693822299, + "grad_norm": 0.8785996437072754, + "learning_rate": 4.164544902144947e-06, + "loss": 0.6543, + "step": 4429 + }, + { + "epoch": 1.635367954713266, + "grad_norm": 0.8553756475448608, + "learning_rate": 4.164181979423984e-06, + "loss": 0.603, + "step": 4430 + }, + { + "epoch": 1.6357371400443022, + "grad_norm": 0.8529859185218811, + "learning_rate": 4.1638189937142584e-06, + "loss": 0.602, + "step": 4431 + }, + { + "epoch": 1.6361063253753385, + "grad_norm": 0.8969563841819763, + "learning_rate": 4.163455945029509e-06, + "loss": 0.6538, + "step": 4432 + }, + { + "epoch": 1.6364755107063746, + "grad_norm": 0.8813667893409729, + "learning_rate": 4.163092833383476e-06, + "loss": 0.6716, + "step": 4433 + }, + { + "epoch": 1.6368446960374108, + "grad_norm": 0.8763986229896545, + "learning_rate": 4.162729658789905e-06, + "loss": 0.6205, + "step": 4434 + }, + { + "epoch": 1.637213881368447, + "grad_norm": 0.8710798025131226, + "learning_rate": 4.1623664212625415e-06, + "loss": 0.66, + "step": 4435 + }, + { + "epoch": 1.637583066699483, + "grad_norm": 0.9128121137619019, + "learning_rate": 4.162003120815133e-06, + "loss": 0.6494, + "step": 4436 + }, + { + "epoch": 1.6379522520305194, + "grad_norm": 0.8682630062103271, + "learning_rate": 4.161639757461431e-06, + "loss": 0.641, + "step": 4437 + }, + { + "epoch": 1.6383214373615556, + "grad_norm": 0.8514406085014343, + "learning_rate": 4.16127633121519e-06, + "loss": 0.6602, + "step": 4438 + }, + { + "epoch": 1.6386906226925917, + "grad_norm": 0.8754714727401733, + "learning_rate": 4.160912842090162e-06, + "loss": 0.6715, + "step": 4439 + }, + { + "epoch": 1.6390598080236278, + "grad_norm": 0.8890629410743713, + "learning_rate": 4.1605492901001075e-06, + "loss": 0.6643, + "step": 4440 + }, + { + "epoch": 1.639428993354664, + "grad_norm": 0.8474121689796448, + "learning_rate": 4.1601856752587874e-06, + "loss": 0.6635, + "step": 4441 + }, + { + "epoch": 1.6397981786857003, + "grad_norm": 0.836716890335083, + "learning_rate": 4.159821997579963e-06, + "loss": 0.6163, + "step": 4442 + }, + { + "epoch": 1.6401673640167362, + "grad_norm": 0.8311505317687988, + "learning_rate": 4.1594582570774e-06, + "loss": 0.6616, + "step": 4443 + }, + { + "epoch": 1.6405365493477726, + "grad_norm": 0.8949893712997437, + "learning_rate": 4.1590944537648655e-06, + "loss": 0.6767, + "step": 4444 + }, + { + "epoch": 1.6409057346788087, + "grad_norm": 0.8867307305335999, + "learning_rate": 4.158730587656129e-06, + "loss": 0.6291, + "step": 4445 + }, + { + "epoch": 1.6412749200098449, + "grad_norm": 0.8814810514450073, + "learning_rate": 4.158366658764965e-06, + "loss": 0.6415, + "step": 4446 + }, + { + "epoch": 1.6416441053408812, + "grad_norm": 0.8695033192634583, + "learning_rate": 4.158002667105145e-06, + "loss": 0.6115, + "step": 4447 + }, + { + "epoch": 1.6420132906719171, + "grad_norm": 0.849412202835083, + "learning_rate": 4.157638612690448e-06, + "loss": 0.6679, + "step": 4448 + }, + { + "epoch": 1.6423824760029535, + "grad_norm": 0.8811483979225159, + "learning_rate": 4.157274495534651e-06, + "loss": 0.6533, + "step": 4449 + }, + { + "epoch": 1.6427516613339896, + "grad_norm": 0.8672307729721069, + "learning_rate": 4.156910315651539e-06, + "loss": 0.6561, + "step": 4450 + }, + { + "epoch": 1.6431208466650258, + "grad_norm": 0.8731364011764526, + "learning_rate": 4.156546073054894e-06, + "loss": 0.6626, + "step": 4451 + }, + { + "epoch": 1.6434900319960621, + "grad_norm": 0.8950603604316711, + "learning_rate": 4.156181767758503e-06, + "loss": 0.6638, + "step": 4452 + }, + { + "epoch": 1.643859217327098, + "grad_norm": 0.8686881065368652, + "learning_rate": 4.155817399776155e-06, + "loss": 0.6347, + "step": 4453 + }, + { + "epoch": 1.6442284026581344, + "grad_norm": 0.8460521697998047, + "learning_rate": 4.155452969121641e-06, + "loss": 0.6464, + "step": 4454 + }, + { + "epoch": 1.6445975879891706, + "grad_norm": 0.8888674974441528, + "learning_rate": 4.155088475808755e-06, + "loss": 0.6704, + "step": 4455 + }, + { + "epoch": 1.6449667733202067, + "grad_norm": 0.8738898634910583, + "learning_rate": 4.154723919851291e-06, + "loss": 0.6263, + "step": 4456 + }, + { + "epoch": 1.645335958651243, + "grad_norm": 0.8509227633476257, + "learning_rate": 4.15435930126305e-06, + "loss": 0.663, + "step": 4457 + }, + { + "epoch": 1.645705143982279, + "grad_norm": 0.8772385716438293, + "learning_rate": 4.153994620057832e-06, + "loss": 0.6645, + "step": 4458 + }, + { + "epoch": 1.6460743293133153, + "grad_norm": 0.8468128442764282, + "learning_rate": 4.153629876249438e-06, + "loss": 0.649, + "step": 4459 + }, + { + "epoch": 1.6464435146443515, + "grad_norm": 0.9028661251068115, + "learning_rate": 4.153265069851677e-06, + "loss": 0.6872, + "step": 4460 + }, + { + "epoch": 1.6468126999753876, + "grad_norm": 0.8842338919639587, + "learning_rate": 4.152900200878354e-06, + "loss": 0.6548, + "step": 4461 + }, + { + "epoch": 1.647181885306424, + "grad_norm": 0.9151118397712708, + "learning_rate": 4.15253526934328e-06, + "loss": 0.6641, + "step": 4462 + }, + { + "epoch": 1.6475510706374599, + "grad_norm": 0.8426380753517151, + "learning_rate": 4.1521702752602675e-06, + "loss": 0.583, + "step": 4463 + }, + { + "epoch": 1.6479202559684962, + "grad_norm": 0.8509050607681274, + "learning_rate": 4.151805218643132e-06, + "loss": 0.6938, + "step": 4464 + }, + { + "epoch": 1.6482894412995324, + "grad_norm": 0.868497371673584, + "learning_rate": 4.15144009950569e-06, + "loss": 0.6501, + "step": 4465 + }, + { + "epoch": 1.6486586266305685, + "grad_norm": 0.8626579642295837, + "learning_rate": 4.1510749178617615e-06, + "loss": 0.6887, + "step": 4466 + }, + { + "epoch": 1.6490278119616049, + "grad_norm": 0.8920122981071472, + "learning_rate": 4.150709673725168e-06, + "loss": 0.6751, + "step": 4467 + }, + { + "epoch": 1.6493969972926408, + "grad_norm": 0.8443697094917297, + "learning_rate": 4.150344367109735e-06, + "loss": 0.6649, + "step": 4468 + }, + { + "epoch": 1.6497661826236771, + "grad_norm": 0.8912809491157532, + "learning_rate": 4.149978998029288e-06, + "loss": 0.633, + "step": 4469 + }, + { + "epoch": 1.6501353679547133, + "grad_norm": 0.8347629904747009, + "learning_rate": 4.149613566497658e-06, + "loss": 0.629, + "step": 4470 + }, + { + "epoch": 1.6505045532857494, + "grad_norm": 0.8615502715110779, + "learning_rate": 4.149248072528673e-06, + "loss": 0.6617, + "step": 4471 + }, + { + "epoch": 1.6508737386167858, + "grad_norm": 0.8401784300804138, + "learning_rate": 4.14888251613617e-06, + "loss": 0.6571, + "step": 4472 + }, + { + "epoch": 1.6512429239478217, + "grad_norm": 0.8849835395812988, + "learning_rate": 4.148516897333984e-06, + "loss": 0.6143, + "step": 4473 + }, + { + "epoch": 1.651612109278858, + "grad_norm": 0.8616957664489746, + "learning_rate": 4.148151216135954e-06, + "loss": 0.6617, + "step": 4474 + }, + { + "epoch": 1.6519812946098942, + "grad_norm": 0.8619272708892822, + "learning_rate": 4.147785472555921e-06, + "loss": 0.633, + "step": 4475 + }, + { + "epoch": 1.6523504799409303, + "grad_norm": 0.9091795682907104, + "learning_rate": 4.147419666607727e-06, + "loss": 0.7013, + "step": 4476 + }, + { + "epoch": 1.6527196652719667, + "grad_norm": 0.8994285464286804, + "learning_rate": 4.14705379830522e-06, + "loss": 0.6601, + "step": 4477 + }, + { + "epoch": 1.6530888506030026, + "grad_norm": 0.8693494200706482, + "learning_rate": 4.146687867662245e-06, + "loss": 0.6776, + "step": 4478 + }, + { + "epoch": 1.653458035934039, + "grad_norm": 0.9726981520652771, + "learning_rate": 4.146321874692654e-06, + "loss": 0.6517, + "step": 4479 + }, + { + "epoch": 1.653827221265075, + "grad_norm": 0.8666999936103821, + "learning_rate": 4.145955819410301e-06, + "loss": 0.6987, + "step": 4480 + }, + { + "epoch": 1.6541964065961112, + "grad_norm": 0.8664538860321045, + "learning_rate": 4.145589701829038e-06, + "loss": 0.6436, + "step": 4481 + }, + { + "epoch": 1.6545655919271476, + "grad_norm": 0.8608652949333191, + "learning_rate": 4.145223521962725e-06, + "loss": 0.6246, + "step": 4482 + }, + { + "epoch": 1.6549347772581835, + "grad_norm": 0.8631076216697693, + "learning_rate": 4.144857279825221e-06, + "loss": 0.6513, + "step": 4483 + }, + { + "epoch": 1.6553039625892199, + "grad_norm": 0.891029417514801, + "learning_rate": 4.144490975430388e-06, + "loss": 0.6596, + "step": 4484 + }, + { + "epoch": 1.655673147920256, + "grad_norm": 0.9006115794181824, + "learning_rate": 4.14412460879209e-06, + "loss": 0.6599, + "step": 4485 + }, + { + "epoch": 1.6560423332512921, + "grad_norm": 0.8670785427093506, + "learning_rate": 4.143758179924195e-06, + "loss": 0.6337, + "step": 4486 + }, + { + "epoch": 1.6564115185823285, + "grad_norm": 0.8894757628440857, + "learning_rate": 4.143391688840572e-06, + "loss": 0.6816, + "step": 4487 + }, + { + "epoch": 1.6567807039133644, + "grad_norm": 0.8802720308303833, + "learning_rate": 4.143025135555092e-06, + "loss": 0.6407, + "step": 4488 + }, + { + "epoch": 1.6571498892444008, + "grad_norm": 0.8646578788757324, + "learning_rate": 4.142658520081629e-06, + "loss": 0.6385, + "step": 4489 + }, + { + "epoch": 1.657519074575437, + "grad_norm": 0.8891850709915161, + "learning_rate": 4.1422918424340595e-06, + "loss": 0.6576, + "step": 4490 + }, + { + "epoch": 1.657888259906473, + "grad_norm": 0.8665881156921387, + "learning_rate": 4.141925102626262e-06, + "loss": 0.6213, + "step": 4491 + }, + { + "epoch": 1.6582574452375092, + "grad_norm": 0.8758256435394287, + "learning_rate": 4.1415583006721175e-06, + "loss": 0.6402, + "step": 4492 + }, + { + "epoch": 1.6586266305685453, + "grad_norm": 0.8827167749404907, + "learning_rate": 4.14119143658551e-06, + "loss": 0.6418, + "step": 4493 + }, + { + "epoch": 1.6589958158995817, + "grad_norm": 0.8715447783470154, + "learning_rate": 4.140824510380324e-06, + "loss": 0.6646, + "step": 4494 + }, + { + "epoch": 1.6593650012306178, + "grad_norm": 0.8832272291183472, + "learning_rate": 4.140457522070449e-06, + "loss": 0.6185, + "step": 4495 + }, + { + "epoch": 1.659734186561654, + "grad_norm": 0.8672773241996765, + "learning_rate": 4.140090471669773e-06, + "loss": 0.66, + "step": 4496 + }, + { + "epoch": 1.66010337189269, + "grad_norm": 0.8404352068901062, + "learning_rate": 4.139723359192192e-06, + "loss": 0.6438, + "step": 4497 + }, + { + "epoch": 1.6604725572237262, + "grad_norm": 0.8741649985313416, + "learning_rate": 4.139356184651598e-06, + "loss": 0.6221, + "step": 4498 + }, + { + "epoch": 1.6608417425547626, + "grad_norm": 0.8387948870658875, + "learning_rate": 4.13898894806189e-06, + "loss": 0.6555, + "step": 4499 + }, + { + "epoch": 1.6612109278857987, + "grad_norm": 0.873931884765625, + "learning_rate": 4.138621649436968e-06, + "loss": 0.6669, + "step": 4500 + }, + { + "epoch": 1.6615801132168349, + "grad_norm": 0.8241584897041321, + "learning_rate": 4.138254288790733e-06, + "loss": 0.6319, + "step": 4501 + }, + { + "epoch": 1.661949298547871, + "grad_norm": 0.8467962145805359, + "learning_rate": 4.137886866137091e-06, + "loss": 0.6636, + "step": 4502 + }, + { + "epoch": 1.6623184838789071, + "grad_norm": 0.8999971747398376, + "learning_rate": 4.137519381489948e-06, + "loss": 0.6467, + "step": 4503 + }, + { + "epoch": 1.6626876692099435, + "grad_norm": 0.8832374215126038, + "learning_rate": 4.137151834863213e-06, + "loss": 0.6775, + "step": 4504 + }, + { + "epoch": 1.6630568545409796, + "grad_norm": 0.8683676719665527, + "learning_rate": 4.136784226270798e-06, + "loss": 0.6316, + "step": 4505 + }, + { + "epoch": 1.6634260398720158, + "grad_norm": 0.8753916621208191, + "learning_rate": 4.136416555726616e-06, + "loss": 0.6681, + "step": 4506 + }, + { + "epoch": 1.663795225203052, + "grad_norm": 0.8960619568824768, + "learning_rate": 4.136048823244585e-06, + "loss": 0.6584, + "step": 4507 + }, + { + "epoch": 1.664164410534088, + "grad_norm": 0.8313222527503967, + "learning_rate": 4.1356810288386205e-06, + "loss": 0.6227, + "step": 4508 + }, + { + "epoch": 1.6645335958651244, + "grad_norm": 0.8524779677391052, + "learning_rate": 4.135313172522647e-06, + "loss": 0.686, + "step": 4509 + }, + { + "epoch": 1.6649027811961603, + "grad_norm": 0.8916214108467102, + "learning_rate": 4.134945254310585e-06, + "loss": 0.6708, + "step": 4510 + }, + { + "epoch": 1.6652719665271967, + "grad_norm": 0.864311933517456, + "learning_rate": 4.1345772742163616e-06, + "loss": 0.6666, + "step": 4511 + }, + { + "epoch": 1.6656411518582328, + "grad_norm": 0.8986158967018127, + "learning_rate": 4.134209232253904e-06, + "loss": 0.6377, + "step": 4512 + }, + { + "epoch": 1.666010337189269, + "grad_norm": 0.8232037425041199, + "learning_rate": 4.133841128437144e-06, + "loss": 0.6739, + "step": 4513 + }, + { + "epoch": 1.6663795225203053, + "grad_norm": 0.8871540427207947, + "learning_rate": 4.133472962780011e-06, + "loss": 0.6218, + "step": 4514 + }, + { + "epoch": 1.6667487078513412, + "grad_norm": 0.8576182723045349, + "learning_rate": 4.133104735296442e-06, + "loss": 0.6777, + "step": 4515 + }, + { + "epoch": 1.6671178931823776, + "grad_norm": 0.8814231157302856, + "learning_rate": 4.132736446000376e-06, + "loss": 0.6487, + "step": 4516 + }, + { + "epoch": 1.6674870785134137, + "grad_norm": 0.876785933971405, + "learning_rate": 4.13236809490575e-06, + "loss": 0.6281, + "step": 4517 + }, + { + "epoch": 1.6678562638444498, + "grad_norm": 0.913142740726471, + "learning_rate": 4.131999682026506e-06, + "loss": 0.6418, + "step": 4518 + }, + { + "epoch": 1.6682254491754862, + "grad_norm": 0.8750118017196655, + "learning_rate": 4.13163120737659e-06, + "loss": 0.6302, + "step": 4519 + }, + { + "epoch": 1.6685946345065221, + "grad_norm": 0.8533949851989746, + "learning_rate": 4.131262670969947e-06, + "loss": 0.6669, + "step": 4520 + }, + { + "epoch": 1.6689638198375585, + "grad_norm": 0.8672575950622559, + "learning_rate": 4.130894072820527e-06, + "loss": 0.6309, + "step": 4521 + }, + { + "epoch": 1.6693330051685946, + "grad_norm": 0.8494476079940796, + "learning_rate": 4.130525412942281e-06, + "loss": 0.6626, + "step": 4522 + }, + { + "epoch": 1.6697021904996308, + "grad_norm": 0.8336800932884216, + "learning_rate": 4.1301566913491635e-06, + "loss": 0.6763, + "step": 4523 + }, + { + "epoch": 1.6700713758306671, + "grad_norm": 0.8762255311012268, + "learning_rate": 4.129787908055129e-06, + "loss": 0.6578, + "step": 4524 + }, + { + "epoch": 1.670440561161703, + "grad_norm": 0.8269521594047546, + "learning_rate": 4.1294190630741374e-06, + "loss": 0.6652, + "step": 4525 + }, + { + "epoch": 1.6708097464927394, + "grad_norm": 0.8518587350845337, + "learning_rate": 4.129050156420147e-06, + "loss": 0.6445, + "step": 4526 + }, + { + "epoch": 1.6711789318237755, + "grad_norm": 0.8928120136260986, + "learning_rate": 4.128681188107123e-06, + "loss": 0.6648, + "step": 4527 + }, + { + "epoch": 1.6715481171548117, + "grad_norm": 0.8664650321006775, + "learning_rate": 4.1283121581490306e-06, + "loss": 0.5903, + "step": 4528 + }, + { + "epoch": 1.671917302485848, + "grad_norm": 0.8605487942695618, + "learning_rate": 4.127943066559836e-06, + "loss": 0.6651, + "step": 4529 + }, + { + "epoch": 1.672286487816884, + "grad_norm": 0.8928911089897156, + "learning_rate": 4.127573913353511e-06, + "loss": 0.6235, + "step": 4530 + }, + { + "epoch": 1.6726556731479203, + "grad_norm": 0.8621736168861389, + "learning_rate": 4.127204698544026e-06, + "loss": 0.6059, + "step": 4531 + }, + { + "epoch": 1.6730248584789564, + "grad_norm": 0.8683192729949951, + "learning_rate": 4.126835422145358e-06, + "loss": 0.647, + "step": 4532 + }, + { + "epoch": 1.6733940438099926, + "grad_norm": 0.8855174779891968, + "learning_rate": 4.126466084171482e-06, + "loss": 0.6525, + "step": 4533 + }, + { + "epoch": 1.673763229141029, + "grad_norm": 0.8851457238197327, + "learning_rate": 4.126096684636378e-06, + "loss": 0.6679, + "step": 4534 + }, + { + "epoch": 1.6741324144720648, + "grad_norm": 0.8995987176895142, + "learning_rate": 4.125727223554028e-06, + "loss": 0.6603, + "step": 4535 + }, + { + "epoch": 1.6745015998031012, + "grad_norm": 0.8718790411949158, + "learning_rate": 4.125357700938415e-06, + "loss": 0.6588, + "step": 4536 + }, + { + "epoch": 1.6748707851341373, + "grad_norm": 0.8850767612457275, + "learning_rate": 4.124988116803527e-06, + "loss": 0.6459, + "step": 4537 + }, + { + "epoch": 1.6752399704651735, + "grad_norm": 0.9180246591567993, + "learning_rate": 4.124618471163351e-06, + "loss": 0.6943, + "step": 4538 + }, + { + "epoch": 1.6756091557962098, + "grad_norm": 0.8736065030097961, + "learning_rate": 4.124248764031879e-06, + "loss": 0.6523, + "step": 4539 + }, + { + "epoch": 1.6759783411272458, + "grad_norm": 0.8764031529426575, + "learning_rate": 4.123878995423103e-06, + "loss": 0.6761, + "step": 4540 + }, + { + "epoch": 1.6763475264582821, + "grad_norm": 0.8447579741477966, + "learning_rate": 4.12350916535102e-06, + "loss": 0.6441, + "step": 4541 + }, + { + "epoch": 1.6767167117893182, + "grad_norm": 0.87704998254776, + "learning_rate": 4.123139273829628e-06, + "loss": 0.6831, + "step": 4542 + }, + { + "epoch": 1.6770858971203544, + "grad_norm": 0.8688414096832275, + "learning_rate": 4.122769320872926e-06, + "loss": 0.6448, + "step": 4543 + }, + { + "epoch": 1.6774550824513907, + "grad_norm": 0.841620922088623, + "learning_rate": 4.122399306494918e-06, + "loss": 0.6575, + "step": 4544 + }, + { + "epoch": 1.6778242677824267, + "grad_norm": 0.8724623918533325, + "learning_rate": 4.122029230709608e-06, + "loss": 0.6301, + "step": 4545 + }, + { + "epoch": 1.678193453113463, + "grad_norm": 0.869983971118927, + "learning_rate": 4.121659093531003e-06, + "loss": 0.67, + "step": 4546 + }, + { + "epoch": 1.6785626384444992, + "grad_norm": 0.8612809777259827, + "learning_rate": 4.121288894973114e-06, + "loss": 0.6572, + "step": 4547 + }, + { + "epoch": 1.6789318237755353, + "grad_norm": 0.8596634864807129, + "learning_rate": 4.120918635049952e-06, + "loss": 0.6719, + "step": 4548 + }, + { + "epoch": 1.6793010091065717, + "grad_norm": 0.8874933123588562, + "learning_rate": 4.120548313775531e-06, + "loss": 0.6322, + "step": 4549 + }, + { + "epoch": 1.6796701944376076, + "grad_norm": 0.8563892245292664, + "learning_rate": 4.1201779311638675e-06, + "loss": 0.6023, + "step": 4550 + }, + { + "epoch": 1.680039379768644, + "grad_norm": 0.8582227826118469, + "learning_rate": 4.119807487228981e-06, + "loss": 0.6481, + "step": 4551 + }, + { + "epoch": 1.68040856509968, + "grad_norm": 0.8827866911888123, + "learning_rate": 4.119436981984892e-06, + "loss": 0.6598, + "step": 4552 + }, + { + "epoch": 1.6807777504307162, + "grad_norm": 0.8593311905860901, + "learning_rate": 4.119066415445624e-06, + "loss": 0.6502, + "step": 4553 + }, + { + "epoch": 1.6811469357617526, + "grad_norm": 0.8818646669387817, + "learning_rate": 4.1186957876252045e-06, + "loss": 0.6358, + "step": 4554 + }, + { + "epoch": 1.6815161210927885, + "grad_norm": 0.872857391834259, + "learning_rate": 4.118325098537659e-06, + "loss": 0.6366, + "step": 4555 + }, + { + "epoch": 1.6818853064238248, + "grad_norm": 0.8681260347366333, + "learning_rate": 4.11795434819702e-06, + "loss": 0.6489, + "step": 4556 + }, + { + "epoch": 1.682254491754861, + "grad_norm": 0.8904328346252441, + "learning_rate": 4.117583536617319e-06, + "loss": 0.6592, + "step": 4557 + }, + { + "epoch": 1.682623677085897, + "grad_norm": 0.8835660815238953, + "learning_rate": 4.117212663812592e-06, + "loss": 0.6835, + "step": 4558 + }, + { + "epoch": 1.6829928624169335, + "grad_norm": 0.8734532594680786, + "learning_rate": 4.116841729796875e-06, + "loss": 0.6139, + "step": 4559 + }, + { + "epoch": 1.6833620477479694, + "grad_norm": 0.8369070291519165, + "learning_rate": 4.116470734584211e-06, + "loss": 0.6478, + "step": 4560 + }, + { + "epoch": 1.6837312330790057, + "grad_norm": 0.8890385031700134, + "learning_rate": 4.116099678188639e-06, + "loss": 0.6746, + "step": 4561 + }, + { + "epoch": 1.6841004184100419, + "grad_norm": 0.8555858731269836, + "learning_rate": 4.115728560624203e-06, + "loss": 0.6995, + "step": 4562 + }, + { + "epoch": 1.684469603741078, + "grad_norm": 0.8777579665184021, + "learning_rate": 4.115357381904952e-06, + "loss": 0.6234, + "step": 4563 + }, + { + "epoch": 1.6848387890721142, + "grad_norm": 0.8944929838180542, + "learning_rate": 4.114986142044934e-06, + "loss": 0.6706, + "step": 4564 + }, + { + "epoch": 1.6852079744031503, + "grad_norm": 0.8800182342529297, + "learning_rate": 4.1146148410582e-06, + "loss": 0.6724, + "step": 4565 + }, + { + "epoch": 1.6855771597341866, + "grad_norm": 0.8617680072784424, + "learning_rate": 4.114243478958804e-06, + "loss": 0.6344, + "step": 4566 + }, + { + "epoch": 1.6859463450652228, + "grad_norm": 0.8661864995956421, + "learning_rate": 4.113872055760801e-06, + "loss": 0.6319, + "step": 4567 + }, + { + "epoch": 1.686315530396259, + "grad_norm": 0.8717219829559326, + "learning_rate": 4.11350057147825e-06, + "loss": 0.6739, + "step": 4568 + }, + { + "epoch": 1.686684715727295, + "grad_norm": 0.8687058091163635, + "learning_rate": 4.113129026125213e-06, + "loss": 0.6558, + "step": 4569 + }, + { + "epoch": 1.6870539010583312, + "grad_norm": 0.8416665196418762, + "learning_rate": 4.11275741971575e-06, + "loss": 0.6151, + "step": 4570 + }, + { + "epoch": 1.6874230863893676, + "grad_norm": 0.8461928963661194, + "learning_rate": 4.1123857522639285e-06, + "loss": 0.6357, + "step": 4571 + }, + { + "epoch": 1.6877922717204037, + "grad_norm": 0.8852282166481018, + "learning_rate": 4.112014023783815e-06, + "loss": 0.662, + "step": 4572 + }, + { + "epoch": 1.6881614570514398, + "grad_norm": 0.8673906922340393, + "learning_rate": 4.1116422342894805e-06, + "loss": 0.6454, + "step": 4573 + }, + { + "epoch": 1.688530642382476, + "grad_norm": 0.8580750226974487, + "learning_rate": 4.111270383794994e-06, + "loss": 0.6234, + "step": 4574 + }, + { + "epoch": 1.688899827713512, + "grad_norm": 0.8770503401756287, + "learning_rate": 4.110898472314433e-06, + "loss": 0.6513, + "step": 4575 + }, + { + "epoch": 1.6892690130445485, + "grad_norm": 0.8480761647224426, + "learning_rate": 4.110526499861873e-06, + "loss": 0.6204, + "step": 4576 + }, + { + "epoch": 1.6896381983755844, + "grad_norm": 0.89314204454422, + "learning_rate": 4.110154466451394e-06, + "loss": 0.6548, + "step": 4577 + }, + { + "epoch": 1.6900073837066207, + "grad_norm": 0.8610304594039917, + "learning_rate": 4.109782372097077e-06, + "loss": 0.6636, + "step": 4578 + }, + { + "epoch": 1.6903765690376569, + "grad_norm": 0.8885032534599304, + "learning_rate": 4.109410216813003e-06, + "loss": 0.6739, + "step": 4579 + }, + { + "epoch": 1.690745754368693, + "grad_norm": 0.8924837112426758, + "learning_rate": 4.109038000613263e-06, + "loss": 0.6467, + "step": 4580 + }, + { + "epoch": 1.6911149396997294, + "grad_norm": 0.8711414337158203, + "learning_rate": 4.10866572351194e-06, + "loss": 0.667, + "step": 4581 + }, + { + "epoch": 1.6914841250307653, + "grad_norm": 0.8909647464752197, + "learning_rate": 4.1082933855231286e-06, + "loss": 0.6513, + "step": 4582 + }, + { + "epoch": 1.6918533103618016, + "grad_norm": 0.8648284077644348, + "learning_rate": 4.10792098666092e-06, + "loss": 0.6656, + "step": 4583 + }, + { + "epoch": 1.6922224956928378, + "grad_norm": 0.8766292929649353, + "learning_rate": 4.107548526939409e-06, + "loss": 0.623, + "step": 4584 + }, + { + "epoch": 1.692591681023874, + "grad_norm": 0.8836564421653748, + "learning_rate": 4.107176006372694e-06, + "loss": 0.6858, + "step": 4585 + }, + { + "epoch": 1.6929608663549103, + "grad_norm": 0.8867332935333252, + "learning_rate": 4.106803424974875e-06, + "loss": 0.6384, + "step": 4586 + }, + { + "epoch": 1.6933300516859462, + "grad_norm": 0.8443453907966614, + "learning_rate": 4.106430782760052e-06, + "loss": 0.6365, + "step": 4587 + }, + { + "epoch": 1.6936992370169826, + "grad_norm": 0.8848949670791626, + "learning_rate": 4.106058079742332e-06, + "loss": 0.6464, + "step": 4588 + }, + { + "epoch": 1.6940684223480187, + "grad_norm": 0.8425247073173523, + "learning_rate": 4.10568531593582e-06, + "loss": 0.6754, + "step": 4589 + }, + { + "epoch": 1.6944376076790548, + "grad_norm": 0.8915621042251587, + "learning_rate": 4.105312491354626e-06, + "loss": 0.6577, + "step": 4590 + }, + { + "epoch": 1.6948067930100912, + "grad_norm": 0.8551478981971741, + "learning_rate": 4.10493960601286e-06, + "loss": 0.6237, + "step": 4591 + }, + { + "epoch": 1.695175978341127, + "grad_norm": 0.8689252138137817, + "learning_rate": 4.104566659924637e-06, + "loss": 0.6186, + "step": 4592 + }, + { + "epoch": 1.6955451636721635, + "grad_norm": 0.8632819056510925, + "learning_rate": 4.104193653104073e-06, + "loss": 0.6748, + "step": 4593 + }, + { + "epoch": 1.6959143490031996, + "grad_norm": 0.8866071105003357, + "learning_rate": 4.103820585565284e-06, + "loss": 0.6518, + "step": 4594 + }, + { + "epoch": 1.6962835343342357, + "grad_norm": 0.897201657295227, + "learning_rate": 4.1034474573223935e-06, + "loss": 0.625, + "step": 4595 + }, + { + "epoch": 1.696652719665272, + "grad_norm": 0.8575178980827332, + "learning_rate": 4.103074268389522e-06, + "loss": 0.6834, + "step": 4596 + }, + { + "epoch": 1.697021904996308, + "grad_norm": 0.926920473575592, + "learning_rate": 4.102701018780796e-06, + "loss": 0.6356, + "step": 4597 + }, + { + "epoch": 1.6973910903273444, + "grad_norm": 0.844546377658844, + "learning_rate": 4.102327708510341e-06, + "loss": 0.6386, + "step": 4598 + }, + { + "epoch": 1.6977602756583805, + "grad_norm": 0.8671624064445496, + "learning_rate": 4.101954337592289e-06, + "loss": 0.6595, + "step": 4599 + }, + { + "epoch": 1.6981294609894166, + "grad_norm": 0.89585942029953, + "learning_rate": 4.101580906040771e-06, + "loss": 0.6353, + "step": 4600 + }, + { + "epoch": 1.698498646320453, + "grad_norm": 0.8706204891204834, + "learning_rate": 4.10120741386992e-06, + "loss": 0.7025, + "step": 4601 + }, + { + "epoch": 1.698867831651489, + "grad_norm": 1.0157390832901, + "learning_rate": 4.100833861093875e-06, + "loss": 0.6364, + "step": 4602 + }, + { + "epoch": 1.6992370169825253, + "grad_norm": 0.8798866868019104, + "learning_rate": 4.100460247726773e-06, + "loss": 0.6338, + "step": 4603 + }, + { + "epoch": 1.6996062023135614, + "grad_norm": 0.9002965092658997, + "learning_rate": 4.100086573782757e-06, + "loss": 0.6667, + "step": 4604 + }, + { + "epoch": 1.6999753876445975, + "grad_norm": 0.8818557262420654, + "learning_rate": 4.099712839275966e-06, + "loss": 0.6625, + "step": 4605 + }, + { + "epoch": 1.700344572975634, + "grad_norm": 0.8560909628868103, + "learning_rate": 4.099339044220552e-06, + "loss": 0.6603, + "step": 4606 + }, + { + "epoch": 1.7007137583066698, + "grad_norm": 0.848798394203186, + "learning_rate": 4.098965188630659e-06, + "loss": 0.6284, + "step": 4607 + }, + { + "epoch": 1.7010829436377062, + "grad_norm": 0.8697683811187744, + "learning_rate": 4.098591272520438e-06, + "loss": 0.6344, + "step": 4608 + }, + { + "epoch": 1.7014521289687423, + "grad_norm": 0.8870057463645935, + "learning_rate": 4.098217295904042e-06, + "loss": 0.6847, + "step": 4609 + }, + { + "epoch": 1.7018213142997785, + "grad_norm": 0.8565171957015991, + "learning_rate": 4.0978432587956265e-06, + "loss": 0.6699, + "step": 4610 + }, + { + "epoch": 1.7021904996308148, + "grad_norm": 0.8559092879295349, + "learning_rate": 4.097469161209346e-06, + "loss": 0.6338, + "step": 4611 + }, + { + "epoch": 1.7025596849618507, + "grad_norm": 0.8519883751869202, + "learning_rate": 4.097095003159364e-06, + "loss": 0.6789, + "step": 4612 + }, + { + "epoch": 1.702928870292887, + "grad_norm": 0.8838430643081665, + "learning_rate": 4.096720784659839e-06, + "loss": 0.683, + "step": 4613 + }, + { + "epoch": 1.7032980556239232, + "grad_norm": 0.8447897434234619, + "learning_rate": 4.096346505724936e-06, + "loss": 0.6664, + "step": 4614 + }, + { + "epoch": 1.7036672409549594, + "grad_norm": 0.9190188050270081, + "learning_rate": 4.095972166368822e-06, + "loss": 0.6591, + "step": 4615 + }, + { + "epoch": 1.7040364262859957, + "grad_norm": 0.8706026673316956, + "learning_rate": 4.095597766605667e-06, + "loss": 0.6447, + "step": 4616 + }, + { + "epoch": 1.7044056116170316, + "grad_norm": 0.8767640590667725, + "learning_rate": 4.095223306449638e-06, + "loss": 0.6595, + "step": 4617 + }, + { + "epoch": 1.704774796948068, + "grad_norm": 0.8558087348937988, + "learning_rate": 4.09484878591491e-06, + "loss": 0.6777, + "step": 4618 + }, + { + "epoch": 1.7051439822791041, + "grad_norm": 0.8823983669281006, + "learning_rate": 4.09447420501566e-06, + "loss": 0.6789, + "step": 4619 + }, + { + "epoch": 1.7055131676101403, + "grad_norm": 0.8649885654449463, + "learning_rate": 4.094099563766064e-06, + "loss": 0.6319, + "step": 4620 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.8672237396240234, + "learning_rate": 4.093724862180303e-06, + "loss": 0.6982, + "step": 4621 + }, + { + "epoch": 1.7062515382722125, + "grad_norm": 0.8662331104278564, + "learning_rate": 4.0933501002725586e-06, + "loss": 0.638, + "step": 4622 + }, + { + "epoch": 1.706620723603249, + "grad_norm": 0.8439859747886658, + "learning_rate": 4.092975278057016e-06, + "loss": 0.6379, + "step": 4623 + }, + { + "epoch": 1.706989908934285, + "grad_norm": 0.8683053255081177, + "learning_rate": 4.092600395547861e-06, + "loss": 0.692, + "step": 4624 + }, + { + "epoch": 1.7073590942653212, + "grad_norm": 0.8957347273826599, + "learning_rate": 4.0922254527592845e-06, + "loss": 0.637, + "step": 4625 + }, + { + "epoch": 1.7077282795963575, + "grad_norm": 0.8247810006141663, + "learning_rate": 4.0918504497054775e-06, + "loss": 0.6531, + "step": 4626 + }, + { + "epoch": 1.7080974649273934, + "grad_norm": 0.8411783576011658, + "learning_rate": 4.091475386400633e-06, + "loss": 0.6115, + "step": 4627 + }, + { + "epoch": 1.7084666502584298, + "grad_norm": 0.8703910112380981, + "learning_rate": 4.091100262858947e-06, + "loss": 0.6605, + "step": 4628 + }, + { + "epoch": 1.708835835589466, + "grad_norm": 0.8796367049217224, + "learning_rate": 4.090725079094618e-06, + "loss": 0.652, + "step": 4629 + }, + { + "epoch": 1.709205020920502, + "grad_norm": 0.8796310424804688, + "learning_rate": 4.0903498351218475e-06, + "loss": 0.6246, + "step": 4630 + }, + { + "epoch": 1.7095742062515382, + "grad_norm": 0.8362928032875061, + "learning_rate": 4.089974530954838e-06, + "loss": 0.6215, + "step": 4631 + }, + { + "epoch": 1.7099433915825744, + "grad_norm": 0.8719574213027954, + "learning_rate": 4.089599166607794e-06, + "loss": 0.626, + "step": 4632 + }, + { + "epoch": 1.7103125769136107, + "grad_norm": 0.8787086606025696, + "learning_rate": 4.0892237420949224e-06, + "loss": 0.6678, + "step": 4633 + }, + { + "epoch": 1.7106817622446469, + "grad_norm": 0.9067367911338806, + "learning_rate": 4.088848257430435e-06, + "loss": 0.6914, + "step": 4634 + }, + { + "epoch": 1.711050947575683, + "grad_norm": 0.8640689849853516, + "learning_rate": 4.088472712628541e-06, + "loss": 0.6502, + "step": 4635 + }, + { + "epoch": 1.7114201329067191, + "grad_norm": 0.8909751176834106, + "learning_rate": 4.088097107703457e-06, + "loss": 0.682, + "step": 4636 + }, + { + "epoch": 1.7117893182377553, + "grad_norm": 0.8807544708251953, + "learning_rate": 4.087721442669399e-06, + "loss": 0.6781, + "step": 4637 + }, + { + "epoch": 1.7121585035687916, + "grad_norm": 0.8601831793785095, + "learning_rate": 4.087345717540585e-06, + "loss": 0.6581, + "step": 4638 + }, + { + "epoch": 1.7125276888998278, + "grad_norm": 0.8632639050483704, + "learning_rate": 4.086969932331236e-06, + "loss": 0.6842, + "step": 4639 + }, + { + "epoch": 1.712896874230864, + "grad_norm": 0.8653634786605835, + "learning_rate": 4.0865940870555774e-06, + "loss": 0.6524, + "step": 4640 + }, + { + "epoch": 1.7132660595619, + "grad_norm": 0.8744351267814636, + "learning_rate": 4.086218181727832e-06, + "loss": 0.6414, + "step": 4641 + }, + { + "epoch": 1.7136352448929362, + "grad_norm": 0.8866009712219238, + "learning_rate": 4.08584221636223e-06, + "loss": 0.656, + "step": 4642 + }, + { + "epoch": 1.7140044302239725, + "grad_norm": 0.8712090253829956, + "learning_rate": 4.085466190973e-06, + "loss": 0.6533, + "step": 4643 + }, + { + "epoch": 1.7143736155550087, + "grad_norm": 0.8651918768882751, + "learning_rate": 4.085090105574376e-06, + "loss": 0.6127, + "step": 4644 + }, + { + "epoch": 1.7147428008860448, + "grad_norm": 0.8509892821311951, + "learning_rate": 4.084713960180591e-06, + "loss": 0.6658, + "step": 4645 + }, + { + "epoch": 1.715111986217081, + "grad_norm": 0.8799416422843933, + "learning_rate": 4.084337754805883e-06, + "loss": 0.6959, + "step": 4646 + }, + { + "epoch": 1.715481171548117, + "grad_norm": 0.8890732526779175, + "learning_rate": 4.08396148946449e-06, + "loss": 0.6448, + "step": 4647 + }, + { + "epoch": 1.7158503568791534, + "grad_norm": 0.8915311694145203, + "learning_rate": 4.083585164170655e-06, + "loss": 0.5998, + "step": 4648 + }, + { + "epoch": 1.7162195422101894, + "grad_norm": 0.8922033905982971, + "learning_rate": 4.083208778938622e-06, + "loss": 0.6453, + "step": 4649 + }, + { + "epoch": 1.7165887275412257, + "grad_norm": 0.8715018630027771, + "learning_rate": 4.082832333782637e-06, + "loss": 0.6533, + "step": 4650 + }, + { + "epoch": 1.7169579128722618, + "grad_norm": 0.8740409016609192, + "learning_rate": 4.082455828716946e-06, + "loss": 0.6496, + "step": 4651 + }, + { + "epoch": 1.717327098203298, + "grad_norm": 0.8579672574996948, + "learning_rate": 4.082079263755802e-06, + "loss": 0.6324, + "step": 4652 + }, + { + "epoch": 1.7176962835343343, + "grad_norm": 0.8562673330307007, + "learning_rate": 4.081702638913457e-06, + "loss": 0.6525, + "step": 4653 + }, + { + "epoch": 1.7180654688653703, + "grad_norm": 0.8806090354919434, + "learning_rate": 4.081325954204167e-06, + "loss": 0.6251, + "step": 4654 + }, + { + "epoch": 1.7184346541964066, + "grad_norm": 0.8676470518112183, + "learning_rate": 4.0809492096421885e-06, + "loss": 0.6678, + "step": 4655 + }, + { + "epoch": 1.7188038395274428, + "grad_norm": 0.8654783368110657, + "learning_rate": 4.080572405241781e-06, + "loss": 0.6513, + "step": 4656 + }, + { + "epoch": 1.719173024858479, + "grad_norm": 0.8825175762176514, + "learning_rate": 4.080195541017207e-06, + "loss": 0.6758, + "step": 4657 + }, + { + "epoch": 1.7195422101895153, + "grad_norm": 0.887965202331543, + "learning_rate": 4.07981861698273e-06, + "loss": 0.63, + "step": 4658 + }, + { + "epoch": 1.7199113955205512, + "grad_norm": 0.8507312536239624, + "learning_rate": 4.0794416331526176e-06, + "loss": 0.6988, + "step": 4659 + }, + { + "epoch": 1.7202805808515875, + "grad_norm": 0.852523148059845, + "learning_rate": 4.079064589541137e-06, + "loss": 0.6121, + "step": 4660 + }, + { + "epoch": 1.7206497661826237, + "grad_norm": 0.8735987544059753, + "learning_rate": 4.078687486162561e-06, + "loss": 0.6608, + "step": 4661 + }, + { + "epoch": 1.7210189515136598, + "grad_norm": 0.8827378749847412, + "learning_rate": 4.078310323031161e-06, + "loss": 0.6813, + "step": 4662 + }, + { + "epoch": 1.7213881368446962, + "grad_norm": 0.897667407989502, + "learning_rate": 4.077933100161215e-06, + "loss": 0.698, + "step": 4663 + }, + { + "epoch": 1.721757322175732, + "grad_norm": 0.8755953311920166, + "learning_rate": 4.077555817566999e-06, + "loss": 0.6543, + "step": 4664 + }, + { + "epoch": 1.7221265075067684, + "grad_norm": 0.8454548716545105, + "learning_rate": 4.077178475262792e-06, + "loss": 0.6326, + "step": 4665 + }, + { + "epoch": 1.7224956928378046, + "grad_norm": 0.8439687490463257, + "learning_rate": 4.076801073262877e-06, + "loss": 0.6391, + "step": 4666 + }, + { + "epoch": 1.7228648781688407, + "grad_norm": 0.875583291053772, + "learning_rate": 4.07642361158154e-06, + "loss": 0.6183, + "step": 4667 + }, + { + "epoch": 1.723234063499877, + "grad_norm": 0.8914098739624023, + "learning_rate": 4.076046090233067e-06, + "loss": 0.6669, + "step": 4668 + }, + { + "epoch": 1.723603248830913, + "grad_norm": 0.8812105059623718, + "learning_rate": 4.075668509231746e-06, + "loss": 0.6333, + "step": 4669 + }, + { + "epoch": 1.7239724341619493, + "grad_norm": 0.8581550717353821, + "learning_rate": 4.07529086859187e-06, + "loss": 0.6477, + "step": 4670 + }, + { + "epoch": 1.7243416194929855, + "grad_norm": 0.8681442141532898, + "learning_rate": 4.074913168327731e-06, + "loss": 0.6243, + "step": 4671 + }, + { + "epoch": 1.7247108048240216, + "grad_norm": 0.8650044202804565, + "learning_rate": 4.074535408453626e-06, + "loss": 0.6887, + "step": 4672 + }, + { + "epoch": 1.725079990155058, + "grad_norm": 0.8634905815124512, + "learning_rate": 4.074157588983854e-06, + "loss": 0.6898, + "step": 4673 + }, + { + "epoch": 1.725449175486094, + "grad_norm": 0.896939218044281, + "learning_rate": 4.073779709932713e-06, + "loss": 0.6505, + "step": 4674 + }, + { + "epoch": 1.7258183608171302, + "grad_norm": 0.8606363534927368, + "learning_rate": 4.0734017713145055e-06, + "loss": 0.6272, + "step": 4675 + }, + { + "epoch": 1.7261875461481664, + "grad_norm": 0.8594246506690979, + "learning_rate": 4.073023773143539e-06, + "loss": 0.6733, + "step": 4676 + }, + { + "epoch": 1.7265567314792025, + "grad_norm": 0.8919916152954102, + "learning_rate": 4.0726457154341185e-06, + "loss": 0.6281, + "step": 4677 + }, + { + "epoch": 1.7269259168102389, + "grad_norm": 0.8576353192329407, + "learning_rate": 4.072267598200554e-06, + "loss": 0.6729, + "step": 4678 + }, + { + "epoch": 1.7272951021412748, + "grad_norm": 0.8682138919830322, + "learning_rate": 4.071889421457158e-06, + "loss": 0.6471, + "step": 4679 + }, + { + "epoch": 1.7276642874723112, + "grad_norm": 0.8584091067314148, + "learning_rate": 4.071511185218243e-06, + "loss": 0.6794, + "step": 4680 + }, + { + "epoch": 1.7280334728033473, + "grad_norm": 0.8807277083396912, + "learning_rate": 4.071132889498125e-06, + "loss": 0.6101, + "step": 4681 + }, + { + "epoch": 1.7284026581343834, + "grad_norm": 0.8279962539672852, + "learning_rate": 4.070754534311124e-06, + "loss": 0.6813, + "step": 4682 + }, + { + "epoch": 1.7287718434654198, + "grad_norm": 0.8616191744804382, + "learning_rate": 4.070376119671559e-06, + "loss": 0.6733, + "step": 4683 + }, + { + "epoch": 1.7291410287964557, + "grad_norm": 0.8546873927116394, + "learning_rate": 4.069997645593754e-06, + "loss": 0.6779, + "step": 4684 + }, + { + "epoch": 1.729510214127492, + "grad_norm": 0.9035440683364868, + "learning_rate": 4.0696191120920325e-06, + "loss": 0.6852, + "step": 4685 + }, + { + "epoch": 1.7298793994585282, + "grad_norm": 0.8769107460975647, + "learning_rate": 4.069240519180724e-06, + "loss": 0.6719, + "step": 4686 + }, + { + "epoch": 1.7302485847895643, + "grad_norm": 0.8778622150421143, + "learning_rate": 4.068861866874157e-06, + "loss": 0.6633, + "step": 4687 + }, + { + "epoch": 1.7306177701206007, + "grad_norm": 0.8511674404144287, + "learning_rate": 4.068483155186663e-06, + "loss": 0.6356, + "step": 4688 + }, + { + "epoch": 1.7309869554516366, + "grad_norm": 0.8530117869377136, + "learning_rate": 4.0681043841325775e-06, + "loss": 0.6867, + "step": 4689 + }, + { + "epoch": 1.731356140782673, + "grad_norm": 0.8720733523368835, + "learning_rate": 4.067725553726235e-06, + "loss": 0.659, + "step": 4690 + }, + { + "epoch": 1.731725326113709, + "grad_norm": 0.8644649982452393, + "learning_rate": 4.067346663981976e-06, + "loss": 0.6741, + "step": 4691 + }, + { + "epoch": 1.7320945114447452, + "grad_norm": 0.8757320046424866, + "learning_rate": 4.06696771491414e-06, + "loss": 0.6213, + "step": 4692 + }, + { + "epoch": 1.7324636967757816, + "grad_norm": 0.863145649433136, + "learning_rate": 4.06658870653707e-06, + "loss": 0.6329, + "step": 4693 + }, + { + "epoch": 1.7328328821068175, + "grad_norm": 0.8454107642173767, + "learning_rate": 4.066209638865113e-06, + "loss": 0.6653, + "step": 4694 + }, + { + "epoch": 1.7332020674378539, + "grad_norm": 0.8701609373092651, + "learning_rate": 4.065830511912614e-06, + "loss": 0.6311, + "step": 4695 + }, + { + "epoch": 1.73357125276889, + "grad_norm": 0.9261037111282349, + "learning_rate": 4.065451325693926e-06, + "loss": 0.6493, + "step": 4696 + }, + { + "epoch": 1.7339404380999262, + "grad_norm": 0.8801414966583252, + "learning_rate": 4.065072080223398e-06, + "loss": 0.6611, + "step": 4697 + }, + { + "epoch": 1.7343096234309623, + "grad_norm": 0.9089010953903198, + "learning_rate": 4.064692775515386e-06, + "loss": 0.6563, + "step": 4698 + }, + { + "epoch": 1.7346788087619984, + "grad_norm": 0.8913570046424866, + "learning_rate": 4.064313411584247e-06, + "loss": 0.6502, + "step": 4699 + }, + { + "epoch": 1.7350479940930348, + "grad_norm": 0.8645070791244507, + "learning_rate": 4.063933988444338e-06, + "loss": 0.6443, + "step": 4700 + }, + { + "epoch": 1.735417179424071, + "grad_norm": 0.8321308493614197, + "learning_rate": 4.063554506110021e-06, + "loss": 0.6512, + "step": 4701 + }, + { + "epoch": 1.735786364755107, + "grad_norm": 0.8726292848587036, + "learning_rate": 4.063174964595661e-06, + "loss": 0.6311, + "step": 4702 + }, + { + "epoch": 1.7361555500861432, + "grad_norm": 0.8812772631645203, + "learning_rate": 4.06279536391562e-06, + "loss": 0.6239, + "step": 4703 + }, + { + "epoch": 1.7365247354171793, + "grad_norm": 0.8556762337684631, + "learning_rate": 4.062415704084268e-06, + "loss": 0.6244, + "step": 4704 + }, + { + "epoch": 1.7368939207482157, + "grad_norm": 0.8610290884971619, + "learning_rate": 4.062035985115976e-06, + "loss": 0.6425, + "step": 4705 + }, + { + "epoch": 1.7372631060792518, + "grad_norm": 0.8555170893669128, + "learning_rate": 4.061656207025115e-06, + "loss": 0.6573, + "step": 4706 + }, + { + "epoch": 1.737632291410288, + "grad_norm": 0.8405477404594421, + "learning_rate": 4.061276369826058e-06, + "loss": 0.6354, + "step": 4707 + }, + { + "epoch": 1.738001476741324, + "grad_norm": 0.8742788434028625, + "learning_rate": 4.060896473533185e-06, + "loss": 0.629, + "step": 4708 + }, + { + "epoch": 1.7383706620723602, + "grad_norm": 0.8465863466262817, + "learning_rate": 4.060516518160871e-06, + "loss": 0.5775, + "step": 4709 + }, + { + "epoch": 1.7387398474033966, + "grad_norm": 0.8308393359184265, + "learning_rate": 4.060136503723501e-06, + "loss": 0.6712, + "step": 4710 + }, + { + "epoch": 1.7391090327344327, + "grad_norm": 0.8817639350891113, + "learning_rate": 4.059756430235456e-06, + "loss": 0.6347, + "step": 4711 + }, + { + "epoch": 1.7394782180654689, + "grad_norm": 0.854555070400238, + "learning_rate": 4.059376297711123e-06, + "loss": 0.607, + "step": 4712 + }, + { + "epoch": 1.739847403396505, + "grad_norm": 0.8577508926391602, + "learning_rate": 4.058996106164888e-06, + "loss": 0.6452, + "step": 4713 + }, + { + "epoch": 1.7402165887275411, + "grad_norm": 0.8582058548927307, + "learning_rate": 4.058615855611144e-06, + "loss": 0.6767, + "step": 4714 + }, + { + "epoch": 1.7405857740585775, + "grad_norm": 0.9054043292999268, + "learning_rate": 4.05823554606428e-06, + "loss": 0.6457, + "step": 4715 + }, + { + "epoch": 1.7409549593896134, + "grad_norm": 0.8792847990989685, + "learning_rate": 4.0578551775386935e-06, + "loss": 0.6406, + "step": 4716 + }, + { + "epoch": 1.7413241447206498, + "grad_norm": 0.8774515986442566, + "learning_rate": 4.0574747500487796e-06, + "loss": 0.6475, + "step": 4717 + }, + { + "epoch": 1.741693330051686, + "grad_norm": 0.8848661184310913, + "learning_rate": 4.0570942636089385e-06, + "loss": 0.6446, + "step": 4718 + }, + { + "epoch": 1.742062515382722, + "grad_norm": 0.8688724040985107, + "learning_rate": 4.056713718233571e-06, + "loss": 0.5993, + "step": 4719 + }, + { + "epoch": 1.7424317007137584, + "grad_norm": 0.8344020247459412, + "learning_rate": 4.05633311393708e-06, + "loss": 0.6417, + "step": 4720 + }, + { + "epoch": 1.7428008860447943, + "grad_norm": 0.8992648720741272, + "learning_rate": 4.055952450733872e-06, + "loss": 0.633, + "step": 4721 + }, + { + "epoch": 1.7431700713758307, + "grad_norm": 0.8486548066139221, + "learning_rate": 4.055571728638356e-06, + "loss": 0.6617, + "step": 4722 + }, + { + "epoch": 1.7435392567068668, + "grad_norm": 0.9176982045173645, + "learning_rate": 4.055190947664939e-06, + "loss": 0.6549, + "step": 4723 + }, + { + "epoch": 1.743908442037903, + "grad_norm": 0.8482176065444946, + "learning_rate": 4.054810107828037e-06, + "loss": 0.6594, + "step": 4724 + }, + { + "epoch": 1.7442776273689393, + "grad_norm": 0.9357988238334656, + "learning_rate": 4.054429209142062e-06, + "loss": 0.6332, + "step": 4725 + }, + { + "epoch": 1.7446468126999752, + "grad_norm": 0.9330418705940247, + "learning_rate": 4.054048251621433e-06, + "loss": 0.6446, + "step": 4726 + }, + { + "epoch": 1.7450159980310116, + "grad_norm": 0.838065505027771, + "learning_rate": 4.0536672352805675e-06, + "loss": 0.633, + "step": 4727 + }, + { + "epoch": 1.7453851833620477, + "grad_norm": 0.8648725748062134, + "learning_rate": 4.053286160133887e-06, + "loss": 0.6439, + "step": 4728 + }, + { + "epoch": 1.7457543686930839, + "grad_norm": 0.8745588660240173, + "learning_rate": 4.052905026195817e-06, + "loss": 0.6698, + "step": 4729 + }, + { + "epoch": 1.7461235540241202, + "grad_norm": 0.8751450777053833, + "learning_rate": 4.052523833480781e-06, + "loss": 0.6525, + "step": 4730 + }, + { + "epoch": 1.7464927393551561, + "grad_norm": 0.8626270890235901, + "learning_rate": 4.052142582003208e-06, + "loss": 0.6593, + "step": 4731 + }, + { + "epoch": 1.7468619246861925, + "grad_norm": 0.8970133662223816, + "learning_rate": 4.051761271777528e-06, + "loss": 0.6479, + "step": 4732 + }, + { + "epoch": 1.7472311100172286, + "grad_norm": 0.852325975894928, + "learning_rate": 4.051379902818173e-06, + "loss": 0.6177, + "step": 4733 + }, + { + "epoch": 1.7476002953482648, + "grad_norm": 0.8505515456199646, + "learning_rate": 4.0509984751395795e-06, + "loss": 0.6598, + "step": 4734 + }, + { + "epoch": 1.7479694806793011, + "grad_norm": 0.8522846698760986, + "learning_rate": 4.050616988756183e-06, + "loss": 0.6679, + "step": 4735 + }, + { + "epoch": 1.748338666010337, + "grad_norm": 0.8759461641311646, + "learning_rate": 4.050235443682423e-06, + "loss": 0.6484, + "step": 4736 + }, + { + "epoch": 1.7487078513413734, + "grad_norm": 0.8944045305252075, + "learning_rate": 4.04985383993274e-06, + "loss": 0.616, + "step": 4737 + }, + { + "epoch": 1.7490770366724095, + "grad_norm": 0.8629976511001587, + "learning_rate": 4.0494721775215784e-06, + "loss": 0.6644, + "step": 4738 + }, + { + "epoch": 1.7494462220034457, + "grad_norm": 0.8817481994628906, + "learning_rate": 4.049090456463385e-06, + "loss": 0.6865, + "step": 4739 + }, + { + "epoch": 1.749815407334482, + "grad_norm": 0.8944889307022095, + "learning_rate": 4.0487086767726054e-06, + "loss": 0.6492, + "step": 4740 + }, + { + "epoch": 1.750184592665518, + "grad_norm": 0.88465815782547, + "learning_rate": 4.048326838463692e-06, + "loss": 0.6496, + "step": 4741 + }, + { + "epoch": 1.7505537779965543, + "grad_norm": 0.8476895689964294, + "learning_rate": 4.047944941551095e-06, + "loss": 0.651, + "step": 4742 + }, + { + "epoch": 1.7509229633275905, + "grad_norm": 0.844473659992218, + "learning_rate": 4.047562986049272e-06, + "loss": 0.6572, + "step": 4743 + }, + { + "epoch": 1.7512921486586266, + "grad_norm": 0.828412652015686, + "learning_rate": 4.047180971972678e-06, + "loss": 0.6246, + "step": 4744 + }, + { + "epoch": 1.751661333989663, + "grad_norm": 0.8667641282081604, + "learning_rate": 4.046798899335773e-06, + "loss": 0.6372, + "step": 4745 + }, + { + "epoch": 1.7520305193206989, + "grad_norm": 0.8535826802253723, + "learning_rate": 4.046416768153017e-06, + "loss": 0.6502, + "step": 4746 + }, + { + "epoch": 1.7523997046517352, + "grad_norm": 0.9097661972045898, + "learning_rate": 4.046034578438875e-06, + "loss": 0.6653, + "step": 4747 + }, + { + "epoch": 1.7527688899827714, + "grad_norm": 0.8684583902359009, + "learning_rate": 4.045652330207811e-06, + "loss": 0.6386, + "step": 4748 + }, + { + "epoch": 1.7531380753138075, + "grad_norm": 0.8628401756286621, + "learning_rate": 4.0452700234742955e-06, + "loss": 0.6537, + "step": 4749 + }, + { + "epoch": 1.7535072606448439, + "grad_norm": 0.8484372496604919, + "learning_rate": 4.044887658252796e-06, + "loss": 0.6245, + "step": 4750 + }, + { + "epoch": 1.7538764459758798, + "grad_norm": 0.8299760222434998, + "learning_rate": 4.044505234557787e-06, + "loss": 0.6709, + "step": 4751 + }, + { + "epoch": 1.7542456313069161, + "grad_norm": 0.860662579536438, + "learning_rate": 4.044122752403743e-06, + "loss": 0.6664, + "step": 4752 + }, + { + "epoch": 1.7546148166379523, + "grad_norm": 0.8510186076164246, + "learning_rate": 4.04374021180514e-06, + "loss": 0.6503, + "step": 4753 + }, + { + "epoch": 1.7549840019689884, + "grad_norm": 0.8756953477859497, + "learning_rate": 4.0433576127764566e-06, + "loss": 0.6101, + "step": 4754 + }, + { + "epoch": 1.7553531873000248, + "grad_norm": 0.8664646744728088, + "learning_rate": 4.042974955332174e-06, + "loss": 0.6323, + "step": 4755 + }, + { + "epoch": 1.7557223726310607, + "grad_norm": 0.8667174577713013, + "learning_rate": 4.0425922394867775e-06, + "loss": 0.6185, + "step": 4756 + }, + { + "epoch": 1.756091557962097, + "grad_norm": 0.8867258429527283, + "learning_rate": 4.04220946525475e-06, + "loss": 0.6524, + "step": 4757 + }, + { + "epoch": 1.7564607432931332, + "grad_norm": 0.8428564071655273, + "learning_rate": 4.041826632650583e-06, + "loss": 0.6732, + "step": 4758 + }, + { + "epoch": 1.7568299286241693, + "grad_norm": 0.8574984669685364, + "learning_rate": 4.041443741688763e-06, + "loss": 0.694, + "step": 4759 + }, + { + "epoch": 1.7571991139552057, + "grad_norm": 0.8868323564529419, + "learning_rate": 4.041060792383785e-06, + "loss": 0.6204, + "step": 4760 + }, + { + "epoch": 1.7575682992862416, + "grad_norm": 0.8907110095024109, + "learning_rate": 4.040677784750142e-06, + "loss": 0.6416, + "step": 4761 + }, + { + "epoch": 1.757937484617278, + "grad_norm": 0.8632540702819824, + "learning_rate": 4.04029471880233e-06, + "loss": 0.6573, + "step": 4762 + }, + { + "epoch": 1.758306669948314, + "grad_norm": 0.907950222492218, + "learning_rate": 4.03991159455485e-06, + "loss": 0.6567, + "step": 4763 + }, + { + "epoch": 1.7586758552793502, + "grad_norm": 0.846466064453125, + "learning_rate": 4.039528412022203e-06, + "loss": 0.633, + "step": 4764 + }, + { + "epoch": 1.7590450406103866, + "grad_norm": 0.8322811722755432, + "learning_rate": 4.03914517121889e-06, + "loss": 0.6541, + "step": 4765 + }, + { + "epoch": 1.7594142259414225, + "grad_norm": 0.8657845854759216, + "learning_rate": 4.038761872159419e-06, + "loss": 0.6637, + "step": 4766 + }, + { + "epoch": 1.7597834112724589, + "grad_norm": 0.852941632270813, + "learning_rate": 4.038378514858296e-06, + "loss": 0.6657, + "step": 4767 + }, + { + "epoch": 1.760152596603495, + "grad_norm": 0.8669825196266174, + "learning_rate": 4.037995099330032e-06, + "loss": 0.6333, + "step": 4768 + }, + { + "epoch": 1.7605217819345311, + "grad_norm": 0.8583007454872131, + "learning_rate": 4.03761162558914e-06, + "loss": 0.6368, + "step": 4769 + }, + { + "epoch": 1.7608909672655673, + "grad_norm": 0.8543320894241333, + "learning_rate": 4.037228093650132e-06, + "loss": 0.6532, + "step": 4770 + }, + { + "epoch": 1.7612601525966034, + "grad_norm": 0.872880220413208, + "learning_rate": 4.0368445035275256e-06, + "loss": 0.6525, + "step": 4771 + }, + { + "epoch": 1.7616293379276398, + "grad_norm": 0.8947968482971191, + "learning_rate": 4.0364608552358404e-06, + "loss": 0.6748, + "step": 4772 + }, + { + "epoch": 1.761998523258676, + "grad_norm": 0.860569417476654, + "learning_rate": 4.036077148789596e-06, + "loss": 0.6309, + "step": 4773 + }, + { + "epoch": 1.762367708589712, + "grad_norm": 0.8578668236732483, + "learning_rate": 4.0356933842033174e-06, + "loss": 0.6444, + "step": 4774 + }, + { + "epoch": 1.7627368939207482, + "grad_norm": 0.8817281723022461, + "learning_rate": 4.0353095614915286e-06, + "loss": 0.6672, + "step": 4775 + }, + { + "epoch": 1.7631060792517843, + "grad_norm": 0.8874582648277283, + "learning_rate": 4.034925680668757e-06, + "loss": 0.6284, + "step": 4776 + }, + { + "epoch": 1.7634752645828207, + "grad_norm": 0.8786579370498657, + "learning_rate": 4.034541741749533e-06, + "loss": 0.6729, + "step": 4777 + }, + { + "epoch": 1.7638444499138568, + "grad_norm": 0.9291033148765564, + "learning_rate": 4.034157744748389e-06, + "loss": 0.627, + "step": 4778 + }, + { + "epoch": 1.764213635244893, + "grad_norm": 0.8524207472801208, + "learning_rate": 4.0337736896798574e-06, + "loss": 0.6613, + "step": 4779 + }, + { + "epoch": 1.764582820575929, + "grad_norm": 0.8642212748527527, + "learning_rate": 4.0333895765584764e-06, + "loss": 0.666, + "step": 4780 + }, + { + "epoch": 1.7649520059069652, + "grad_norm": 0.9171303510665894, + "learning_rate": 4.033005405398784e-06, + "loss": 0.6553, + "step": 4781 + }, + { + "epoch": 1.7653211912380016, + "grad_norm": 0.8397627472877502, + "learning_rate": 4.03262117621532e-06, + "loss": 0.6291, + "step": 4782 + }, + { + "epoch": 1.7656903765690377, + "grad_norm": 0.8296987414360046, + "learning_rate": 4.03223688902263e-06, + "loss": 0.6331, + "step": 4783 + }, + { + "epoch": 1.7660595619000738, + "grad_norm": 0.8518911600112915, + "learning_rate": 4.031852543835255e-06, + "loss": 0.6657, + "step": 4784 + }, + { + "epoch": 1.76642874723111, + "grad_norm": 0.877257227897644, + "learning_rate": 4.031468140667746e-06, + "loss": 0.688, + "step": 4785 + }, + { + "epoch": 1.7667979325621461, + "grad_norm": 0.9005976915359497, + "learning_rate": 4.031083679534651e-06, + "loss": 0.6316, + "step": 4786 + }, + { + "epoch": 1.7671671178931825, + "grad_norm": 0.8616930246353149, + "learning_rate": 4.030699160450522e-06, + "loss": 0.6403, + "step": 4787 + }, + { + "epoch": 1.7675363032242184, + "grad_norm": 0.852268397808075, + "learning_rate": 4.030314583429913e-06, + "loss": 0.6424, + "step": 4788 + }, + { + "epoch": 1.7679054885552548, + "grad_norm": 0.8587179183959961, + "learning_rate": 4.02992994848738e-06, + "loss": 0.6473, + "step": 4789 + }, + { + "epoch": 1.768274673886291, + "grad_norm": 0.8719492554664612, + "learning_rate": 4.029545255637481e-06, + "loss": 0.6336, + "step": 4790 + }, + { + "epoch": 1.768643859217327, + "grad_norm": 0.8550917506217957, + "learning_rate": 4.029160504894777e-06, + "loss": 0.6258, + "step": 4791 + }, + { + "epoch": 1.7690130445483634, + "grad_norm": 0.8471203446388245, + "learning_rate": 4.02877569627383e-06, + "loss": 0.6334, + "step": 4792 + }, + { + "epoch": 1.7693822298793993, + "grad_norm": 0.8624316453933716, + "learning_rate": 4.028390829789206e-06, + "loss": 0.6414, + "step": 4793 + }, + { + "epoch": 1.7697514152104357, + "grad_norm": 0.8651792407035828, + "learning_rate": 4.028005905455472e-06, + "loss": 0.6621, + "step": 4794 + }, + { + "epoch": 1.7701206005414718, + "grad_norm": 0.8879402279853821, + "learning_rate": 4.027620923287196e-06, + "loss": 0.6324, + "step": 4795 + }, + { + "epoch": 1.770489785872508, + "grad_norm": 0.8610432744026184, + "learning_rate": 4.027235883298949e-06, + "loss": 0.6816, + "step": 4796 + }, + { + "epoch": 1.7708589712035443, + "grad_norm": 0.8857172131538391, + "learning_rate": 4.026850785505308e-06, + "loss": 0.6251, + "step": 4797 + }, + { + "epoch": 1.7712281565345802, + "grad_norm": 0.8402075171470642, + "learning_rate": 4.026465629920845e-06, + "loss": 0.6445, + "step": 4798 + }, + { + "epoch": 1.7715973418656166, + "grad_norm": 0.8597134947776794, + "learning_rate": 4.026080416560141e-06, + "loss": 0.6479, + "step": 4799 + }, + { + "epoch": 1.7719665271966527, + "grad_norm": 0.8565741777420044, + "learning_rate": 4.025695145437774e-06, + "loss": 0.6557, + "step": 4800 + }, + { + "epoch": 1.7723357125276888, + "grad_norm": 0.8824018836021423, + "learning_rate": 4.025309816568327e-06, + "loss": 0.6219, + "step": 4801 + }, + { + "epoch": 1.7727048978587252, + "grad_norm": 0.8586277365684509, + "learning_rate": 4.024924429966386e-06, + "loss": 0.6162, + "step": 4802 + }, + { + "epoch": 1.7730740831897611, + "grad_norm": 0.8624557256698608, + "learning_rate": 4.024538985646536e-06, + "loss": 0.6579, + "step": 4803 + }, + { + "epoch": 1.7734432685207975, + "grad_norm": 0.8990819454193115, + "learning_rate": 4.024153483623366e-06, + "loss": 0.6291, + "step": 4804 + }, + { + "epoch": 1.7738124538518336, + "grad_norm": 0.8472552299499512, + "learning_rate": 4.023767923911468e-06, + "loss": 0.6462, + "step": 4805 + }, + { + "epoch": 1.7741816391828698, + "grad_norm": 0.9162282943725586, + "learning_rate": 4.023382306525435e-06, + "loss": 0.6464, + "step": 4806 + }, + { + "epoch": 1.774550824513906, + "grad_norm": 0.8442236185073853, + "learning_rate": 4.022996631479863e-06, + "loss": 0.6281, + "step": 4807 + }, + { + "epoch": 1.774920009844942, + "grad_norm": 0.8495954871177673, + "learning_rate": 4.022610898789349e-06, + "loss": 0.672, + "step": 4808 + }, + { + "epoch": 1.7752891951759784, + "grad_norm": 0.83000248670578, + "learning_rate": 4.0222251084684925e-06, + "loss": 0.6355, + "step": 4809 + }, + { + "epoch": 1.7756583805070145, + "grad_norm": 0.8509250283241272, + "learning_rate": 4.021839260531896e-06, + "loss": 0.6629, + "step": 4810 + }, + { + "epoch": 1.7760275658380507, + "grad_norm": 0.8949050903320312, + "learning_rate": 4.021453354994164e-06, + "loss": 0.6749, + "step": 4811 + }, + { + "epoch": 1.776396751169087, + "grad_norm": 0.8776379227638245, + "learning_rate": 4.021067391869903e-06, + "loss": 0.676, + "step": 4812 + }, + { + "epoch": 1.776765936500123, + "grad_norm": 0.8830000162124634, + "learning_rate": 4.020681371173721e-06, + "loss": 0.6806, + "step": 4813 + }, + { + "epoch": 1.7771351218311593, + "grad_norm": 0.9037566184997559, + "learning_rate": 4.020295292920229e-06, + "loss": 0.6867, + "step": 4814 + }, + { + "epoch": 1.7775043071621954, + "grad_norm": 0.8821821212768555, + "learning_rate": 4.01990915712404e-06, + "loss": 0.6701, + "step": 4815 + }, + { + "epoch": 1.7778734924932316, + "grad_norm": 0.8942487835884094, + "learning_rate": 4.019522963799769e-06, + "loss": 0.6872, + "step": 4816 + }, + { + "epoch": 1.778242677824268, + "grad_norm": 0.895228385925293, + "learning_rate": 4.019136712962034e-06, + "loss": 0.649, + "step": 4817 + }, + { + "epoch": 1.7786118631553038, + "grad_norm": 0.8838170766830444, + "learning_rate": 4.018750404625453e-06, + "loss": 0.6567, + "step": 4818 + }, + { + "epoch": 1.7789810484863402, + "grad_norm": 0.8944429755210876, + "learning_rate": 4.01836403880465e-06, + "loss": 0.6951, + "step": 4819 + }, + { + "epoch": 1.7793502338173763, + "grad_norm": 0.8805184960365295, + "learning_rate": 4.017977615514246e-06, + "loss": 0.6735, + "step": 4820 + }, + { + "epoch": 1.7797194191484125, + "grad_norm": 0.865036129951477, + "learning_rate": 4.0175911347688684e-06, + "loss": 0.6042, + "step": 4821 + }, + { + "epoch": 1.7800886044794488, + "grad_norm": 0.8644222021102905, + "learning_rate": 4.017204596583145e-06, + "loss": 0.6704, + "step": 4822 + }, + { + "epoch": 1.7804577898104847, + "grad_norm": 0.8980161547660828, + "learning_rate": 4.016818000971707e-06, + "loss": 0.6368, + "step": 4823 + }, + { + "epoch": 1.780826975141521, + "grad_norm": 0.8645391464233398, + "learning_rate": 4.0164313479491865e-06, + "loss": 0.6602, + "step": 4824 + }, + { + "epoch": 1.7811961604725572, + "grad_norm": 0.870156466960907, + "learning_rate": 4.016044637530218e-06, + "loss": 0.6508, + "step": 4825 + }, + { + "epoch": 1.7815653458035934, + "grad_norm": 0.9023107290267944, + "learning_rate": 4.015657869729438e-06, + "loss": 0.6198, + "step": 4826 + }, + { + "epoch": 1.7819345311346297, + "grad_norm": 0.8312729001045227, + "learning_rate": 4.015271044561487e-06, + "loss": 0.6545, + "step": 4827 + }, + { + "epoch": 1.7823037164656657, + "grad_norm": 0.8387230038642883, + "learning_rate": 4.014884162041004e-06, + "loss": 0.6274, + "step": 4828 + }, + { + "epoch": 1.782672901796702, + "grad_norm": 0.8463006019592285, + "learning_rate": 4.014497222182635e-06, + "loss": 0.6432, + "step": 4829 + }, + { + "epoch": 1.7830420871277382, + "grad_norm": 0.860016942024231, + "learning_rate": 4.014110225001024e-06, + "loss": 0.6573, + "step": 4830 + }, + { + "epoch": 1.7834112724587743, + "grad_norm": 0.9237045645713806, + "learning_rate": 4.013723170510817e-06, + "loss": 0.6685, + "step": 4831 + }, + { + "epoch": 1.7837804577898106, + "grad_norm": 0.8977932929992676, + "learning_rate": 4.0133360587266665e-06, + "loss": 0.6423, + "step": 4832 + }, + { + "epoch": 1.7841496431208466, + "grad_norm": 0.8514804244041443, + "learning_rate": 4.012948889663224e-06, + "loss": 0.6557, + "step": 4833 + }, + { + "epoch": 1.784518828451883, + "grad_norm": 0.8666879534721375, + "learning_rate": 4.012561663335144e-06, + "loss": 0.6618, + "step": 4834 + }, + { + "epoch": 1.784888013782919, + "grad_norm": 0.854735255241394, + "learning_rate": 4.012174379757082e-06, + "loss": 0.6375, + "step": 4835 + }, + { + "epoch": 1.7852571991139552, + "grad_norm": 0.8786066770553589, + "learning_rate": 4.011787038943697e-06, + "loss": 0.6261, + "step": 4836 + }, + { + "epoch": 1.7856263844449913, + "grad_norm": 0.852622926235199, + "learning_rate": 4.01139964090965e-06, + "loss": 0.6444, + "step": 4837 + }, + { + "epoch": 1.7859955697760275, + "grad_norm": 0.8683868646621704, + "learning_rate": 4.011012185669603e-06, + "loss": 0.642, + "step": 4838 + }, + { + "epoch": 1.7863647551070638, + "grad_norm": 0.876533567905426, + "learning_rate": 4.010624673238222e-06, + "loss": 0.6369, + "step": 4839 + }, + { + "epoch": 1.7867339404381, + "grad_norm": 0.8687669038772583, + "learning_rate": 4.0102371036301745e-06, + "loss": 0.6723, + "step": 4840 + }, + { + "epoch": 1.787103125769136, + "grad_norm": 0.8760514259338379, + "learning_rate": 4.0098494768601285e-06, + "loss": 0.655, + "step": 4841 + }, + { + "epoch": 1.7874723111001722, + "grad_norm": 0.8729817867279053, + "learning_rate": 4.009461792942756e-06, + "loss": 0.6494, + "step": 4842 + }, + { + "epoch": 1.7878414964312084, + "grad_norm": 0.8553536534309387, + "learning_rate": 4.0090740518927315e-06, + "loss": 0.6302, + "step": 4843 + }, + { + "epoch": 1.7882106817622447, + "grad_norm": 0.8747924566268921, + "learning_rate": 4.008686253724731e-06, + "loss": 0.6485, + "step": 4844 + }, + { + "epoch": 1.7885798670932809, + "grad_norm": 0.8502061367034912, + "learning_rate": 4.008298398453431e-06, + "loss": 0.6576, + "step": 4845 + }, + { + "epoch": 1.788949052424317, + "grad_norm": 0.8652560114860535, + "learning_rate": 4.007910486093513e-06, + "loss": 0.6466, + "step": 4846 + }, + { + "epoch": 1.7893182377553531, + "grad_norm": 0.860003650188446, + "learning_rate": 4.007522516659658e-06, + "loss": 0.6635, + "step": 4847 + }, + { + "epoch": 1.7896874230863893, + "grad_norm": 0.8634912967681885, + "learning_rate": 4.0071344901665535e-06, + "loss": 0.6548, + "step": 4848 + }, + { + "epoch": 1.7900566084174256, + "grad_norm": 0.8772385120391846, + "learning_rate": 4.006746406628883e-06, + "loss": 0.6727, + "step": 4849 + }, + { + "epoch": 1.7904257937484618, + "grad_norm": 0.8333175182342529, + "learning_rate": 4.0063582660613365e-06, + "loss": 0.6599, + "step": 4850 + }, + { + "epoch": 1.790794979079498, + "grad_norm": 0.873171329498291, + "learning_rate": 4.0059700684786055e-06, + "loss": 0.6724, + "step": 4851 + }, + { + "epoch": 1.791164164410534, + "grad_norm": 0.9940029382705688, + "learning_rate": 4.005581813895383e-06, + "loss": 0.6546, + "step": 4852 + }, + { + "epoch": 1.7915333497415702, + "grad_norm": 0.8843750357627869, + "learning_rate": 4.005193502326363e-06, + "loss": 0.7091, + "step": 4853 + }, + { + "epoch": 1.7919025350726066, + "grad_norm": 0.8900793194770813, + "learning_rate": 4.004805133786245e-06, + "loss": 0.643, + "step": 4854 + }, + { + "epoch": 1.7922717204036425, + "grad_norm": 0.867699384689331, + "learning_rate": 4.004416708289727e-06, + "loss": 0.6695, + "step": 4855 + }, + { + "epoch": 1.7926409057346788, + "grad_norm": 0.8790539503097534, + "learning_rate": 4.004028225851511e-06, + "loss": 0.6911, + "step": 4856 + }, + { + "epoch": 1.793010091065715, + "grad_norm": 0.8665317893028259, + "learning_rate": 4.003639686486301e-06, + "loss": 0.6633, + "step": 4857 + }, + { + "epoch": 1.793379276396751, + "grad_norm": 0.8493485450744629, + "learning_rate": 4.003251090208805e-06, + "loss": 0.6664, + "step": 4858 + }, + { + "epoch": 1.7937484617277875, + "grad_norm": 0.8779489398002625, + "learning_rate": 4.002862437033729e-06, + "loss": 0.6521, + "step": 4859 + }, + { + "epoch": 1.7941176470588234, + "grad_norm": 0.8971390724182129, + "learning_rate": 4.0024737269757845e-06, + "loss": 0.6411, + "step": 4860 + }, + { + "epoch": 1.7944868323898597, + "grad_norm": 0.8787785768508911, + "learning_rate": 4.0020849600496826e-06, + "loss": 0.647, + "step": 4861 + }, + { + "epoch": 1.7948560177208959, + "grad_norm": 0.858384370803833, + "learning_rate": 4.00169613627014e-06, + "loss": 0.6709, + "step": 4862 + }, + { + "epoch": 1.795225203051932, + "grad_norm": 0.8787199258804321, + "learning_rate": 4.001307255651871e-06, + "loss": 0.6561, + "step": 4863 + }, + { + "epoch": 1.7955943883829684, + "grad_norm": 0.8710545897483826, + "learning_rate": 4.000918318209598e-06, + "loss": 0.6727, + "step": 4864 + }, + { + "epoch": 1.7959635737140043, + "grad_norm": 0.8709081411361694, + "learning_rate": 4.00052932395804e-06, + "loss": 0.6786, + "step": 4865 + }, + { + "epoch": 1.7963327590450406, + "grad_norm": 0.938996434211731, + "learning_rate": 4.0001402729119195e-06, + "loss": 0.6274, + "step": 4866 + }, + { + "epoch": 1.7967019443760768, + "grad_norm": 0.8435667157173157, + "learning_rate": 3.9997511650859645e-06, + "loss": 0.6504, + "step": 4867 + }, + { + "epoch": 1.797071129707113, + "grad_norm": 0.8694499135017395, + "learning_rate": 3.999362000494901e-06, + "loss": 0.6615, + "step": 4868 + }, + { + "epoch": 1.7974403150381493, + "grad_norm": 0.8645815253257751, + "learning_rate": 3.9989727791534584e-06, + "loss": 0.6168, + "step": 4869 + }, + { + "epoch": 1.7978095003691852, + "grad_norm": 0.8785998821258545, + "learning_rate": 3.998583501076369e-06, + "loss": 0.6541, + "step": 4870 + }, + { + "epoch": 1.7981786857002215, + "grad_norm": 0.8967954516410828, + "learning_rate": 3.9981941662783675e-06, + "loss": 0.6679, + "step": 4871 + }, + { + "epoch": 1.7985478710312577, + "grad_norm": 0.8639390468597412, + "learning_rate": 3.99780477477419e-06, + "loss": 0.6455, + "step": 4872 + }, + { + "epoch": 1.7989170563622938, + "grad_norm": 0.8752289414405823, + "learning_rate": 3.997415326578574e-06, + "loss": 0.6576, + "step": 4873 + }, + { + "epoch": 1.7992862416933302, + "grad_norm": 0.8684571385383606, + "learning_rate": 3.997025821706261e-06, + "loss": 0.6165, + "step": 4874 + }, + { + "epoch": 1.799655427024366, + "grad_norm": 0.846998393535614, + "learning_rate": 3.996636260171992e-06, + "loss": 0.6739, + "step": 4875 + }, + { + "epoch": 1.8000246123554025, + "grad_norm": 0.8546062111854553, + "learning_rate": 3.996246641990514e-06, + "loss": 0.6272, + "step": 4876 + }, + { + "epoch": 1.8003937976864386, + "grad_norm": 0.8490734100341797, + "learning_rate": 3.995856967176572e-06, + "loss": 0.6414, + "step": 4877 + }, + { + "epoch": 1.8007629830174747, + "grad_norm": 0.8480061888694763, + "learning_rate": 3.995467235744917e-06, + "loss": 0.6622, + "step": 4878 + }, + { + "epoch": 1.801132168348511, + "grad_norm": 0.8799357414245605, + "learning_rate": 3.995077447710298e-06, + "loss": 0.6658, + "step": 4879 + }, + { + "epoch": 1.801501353679547, + "grad_norm": 0.8820101618766785, + "learning_rate": 3.994687603087471e-06, + "loss": 0.6478, + "step": 4880 + }, + { + "epoch": 1.8018705390105834, + "grad_norm": 0.8734553456306458, + "learning_rate": 3.994297701891188e-06, + "loss": 0.6469, + "step": 4881 + }, + { + "epoch": 1.8022397243416195, + "grad_norm": 0.8651158809661865, + "learning_rate": 3.99390774413621e-06, + "loss": 0.6284, + "step": 4882 + }, + { + "epoch": 1.8026089096726556, + "grad_norm": 0.902680516242981, + "learning_rate": 3.993517729837294e-06, + "loss": 0.6612, + "step": 4883 + }, + { + "epoch": 1.802978095003692, + "grad_norm": 0.8663744330406189, + "learning_rate": 3.993127659009205e-06, + "loss": 0.6244, + "step": 4884 + }, + { + "epoch": 1.803347280334728, + "grad_norm": 0.8921365141868591, + "learning_rate": 3.992737531666704e-06, + "loss": 0.642, + "step": 4885 + }, + { + "epoch": 1.8037164656657643, + "grad_norm": 0.8933938145637512, + "learning_rate": 3.992347347824559e-06, + "loss": 0.6597, + "step": 4886 + }, + { + "epoch": 1.8040856509968004, + "grad_norm": 0.8590527772903442, + "learning_rate": 3.991957107497537e-06, + "loss": 0.6498, + "step": 4887 + }, + { + "epoch": 1.8044548363278365, + "grad_norm": 0.8795635104179382, + "learning_rate": 3.991566810700411e-06, + "loss": 0.644, + "step": 4888 + }, + { + "epoch": 1.804824021658873, + "grad_norm": 0.8671436905860901, + "learning_rate": 3.9911764574479505e-06, + "loss": 0.6602, + "step": 4889 + }, + { + "epoch": 1.8051932069899088, + "grad_norm": 0.832854151725769, + "learning_rate": 3.990786047754933e-06, + "loss": 0.6629, + "step": 4890 + }, + { + "epoch": 1.8055623923209452, + "grad_norm": 0.8766759037971497, + "learning_rate": 3.9903955816361335e-06, + "loss": 0.6488, + "step": 4891 + }, + { + "epoch": 1.8059315776519813, + "grad_norm": 0.8763872981071472, + "learning_rate": 3.990005059106332e-06, + "loss": 0.6598, + "step": 4892 + }, + { + "epoch": 1.8063007629830174, + "grad_norm": 0.8656853437423706, + "learning_rate": 3.989614480180309e-06, + "loss": 0.6412, + "step": 4893 + }, + { + "epoch": 1.8066699483140538, + "grad_norm": 0.8918483257293701, + "learning_rate": 3.9892238448728475e-06, + "loss": 0.6503, + "step": 4894 + }, + { + "epoch": 1.8070391336450897, + "grad_norm": 0.8660863637924194, + "learning_rate": 3.9888331531987335e-06, + "loss": 0.6519, + "step": 4895 + }, + { + "epoch": 1.807408318976126, + "grad_norm": 0.8563582301139832, + "learning_rate": 3.988442405172755e-06, + "loss": 0.654, + "step": 4896 + }, + { + "epoch": 1.8077775043071622, + "grad_norm": 0.8846266865730286, + "learning_rate": 3.988051600809701e-06, + "loss": 0.664, + "step": 4897 + }, + { + "epoch": 1.8081466896381984, + "grad_norm": 0.8938320279121399, + "learning_rate": 3.987660740124364e-06, + "loss": 0.6596, + "step": 4898 + }, + { + "epoch": 1.8085158749692347, + "grad_norm": 0.8648285865783691, + "learning_rate": 3.987269823131537e-06, + "loss": 0.6469, + "step": 4899 + }, + { + "epoch": 1.8088850603002706, + "grad_norm": 0.8952844142913818, + "learning_rate": 3.986878849846016e-06, + "loss": 0.6333, + "step": 4900 + }, + { + "epoch": 1.809254245631307, + "grad_norm": 0.8702192306518555, + "learning_rate": 3.986487820282602e-06, + "loss": 0.6648, + "step": 4901 + }, + { + "epoch": 1.8096234309623431, + "grad_norm": 0.8681883215904236, + "learning_rate": 3.986096734456092e-06, + "loss": 0.6699, + "step": 4902 + }, + { + "epoch": 1.8099926162933793, + "grad_norm": 0.865635335445404, + "learning_rate": 3.985705592381289e-06, + "loss": 0.6385, + "step": 4903 + }, + { + "epoch": 1.8103618016244156, + "grad_norm": 0.8648701310157776, + "learning_rate": 3.985314394072997e-06, + "loss": 0.6496, + "step": 4904 + }, + { + "epoch": 1.8107309869554515, + "grad_norm": 0.8681031465530396, + "learning_rate": 3.984923139546026e-06, + "loss": 0.6481, + "step": 4905 + }, + { + "epoch": 1.811100172286488, + "grad_norm": 0.8878256678581238, + "learning_rate": 3.984531828815183e-06, + "loss": 0.6466, + "step": 4906 + }, + { + "epoch": 1.811469357617524, + "grad_norm": 0.8278356790542603, + "learning_rate": 3.984140461895278e-06, + "loss": 0.6404, + "step": 4907 + }, + { + "epoch": 1.8118385429485602, + "grad_norm": 0.8608059287071228, + "learning_rate": 3.9837490388011245e-06, + "loss": 0.647, + "step": 4908 + }, + { + "epoch": 1.8122077282795963, + "grad_norm": 0.8612789511680603, + "learning_rate": 3.98335755954754e-06, + "loss": 0.6857, + "step": 4909 + }, + { + "epoch": 1.8125769136106324, + "grad_norm": 0.9022255539894104, + "learning_rate": 3.982966024149338e-06, + "loss": 0.6531, + "step": 4910 + }, + { + "epoch": 1.8129460989416688, + "grad_norm": 0.8742957711219788, + "learning_rate": 3.982574432621341e-06, + "loss": 0.6544, + "step": 4911 + }, + { + "epoch": 1.813315284272705, + "grad_norm": 0.8770278692245483, + "learning_rate": 3.98218278497837e-06, + "loss": 0.6411, + "step": 4912 + }, + { + "epoch": 1.813684469603741, + "grad_norm": 0.8758889436721802, + "learning_rate": 3.981791081235246e-06, + "loss": 0.69, + "step": 4913 + }, + { + "epoch": 1.8140536549347772, + "grad_norm": 0.8833830952644348, + "learning_rate": 3.9813993214068e-06, + "loss": 0.6723, + "step": 4914 + }, + { + "epoch": 1.8144228402658134, + "grad_norm": 0.8730379343032837, + "learning_rate": 3.9810075055078565e-06, + "loss": 0.6513, + "step": 4915 + }, + { + "epoch": 1.8147920255968497, + "grad_norm": 0.8649890422821045, + "learning_rate": 3.980615633553246e-06, + "loss": 0.6762, + "step": 4916 + }, + { + "epoch": 1.8151612109278858, + "grad_norm": 0.8449490666389465, + "learning_rate": 3.980223705557801e-06, + "loss": 0.6146, + "step": 4917 + }, + { + "epoch": 1.815530396258922, + "grad_norm": 0.8612676858901978, + "learning_rate": 3.979831721536355e-06, + "loss": 0.6725, + "step": 4918 + }, + { + "epoch": 1.8158995815899581, + "grad_norm": 0.8917443156242371, + "learning_rate": 3.979439681503747e-06, + "loss": 0.6229, + "step": 4919 + }, + { + "epoch": 1.8162687669209943, + "grad_norm": 0.8380262851715088, + "learning_rate": 3.979047585474813e-06, + "loss": 0.5828, + "step": 4920 + }, + { + "epoch": 1.8166379522520306, + "grad_norm": 0.8867169618606567, + "learning_rate": 3.978655433464395e-06, + "loss": 0.6388, + "step": 4921 + }, + { + "epoch": 1.8170071375830665, + "grad_norm": 0.8853972554206848, + "learning_rate": 3.978263225487337e-06, + "loss": 0.6603, + "step": 4922 + }, + { + "epoch": 1.817376322914103, + "grad_norm": 0.8818216919898987, + "learning_rate": 3.97787096155848e-06, + "loss": 0.6386, + "step": 4923 + }, + { + "epoch": 1.817745508245139, + "grad_norm": 0.891826868057251, + "learning_rate": 3.977478641692675e-06, + "loss": 0.6339, + "step": 4924 + }, + { + "epoch": 1.8181146935761752, + "grad_norm": 0.8542366623878479, + "learning_rate": 3.977086265904769e-06, + "loss": 0.6419, + "step": 4925 + }, + { + "epoch": 1.8184838789072115, + "grad_norm": 0.8567746877670288, + "learning_rate": 3.976693834209614e-06, + "loss": 0.6653, + "step": 4926 + }, + { + "epoch": 1.8188530642382474, + "grad_norm": 0.873786985874176, + "learning_rate": 3.976301346622064e-06, + "loss": 0.6555, + "step": 4927 + }, + { + "epoch": 1.8192222495692838, + "grad_norm": 0.8266376852989197, + "learning_rate": 3.975908803156974e-06, + "loss": 0.5917, + "step": 4928 + }, + { + "epoch": 1.81959143490032, + "grad_norm": 0.8157034516334534, + "learning_rate": 3.975516203829201e-06, + "loss": 0.6232, + "step": 4929 + }, + { + "epoch": 1.819960620231356, + "grad_norm": 0.8456475138664246, + "learning_rate": 3.975123548653606e-06, + "loss": 0.6426, + "step": 4930 + }, + { + "epoch": 1.8203298055623924, + "grad_norm": 0.8957807421684265, + "learning_rate": 3.97473083764505e-06, + "loss": 0.6848, + "step": 4931 + }, + { + "epoch": 1.8206989908934283, + "grad_norm": 0.8761498332023621, + "learning_rate": 3.974338070818397e-06, + "loss": 0.6474, + "step": 4932 + }, + { + "epoch": 1.8210681762244647, + "grad_norm": 0.8844765424728394, + "learning_rate": 3.973945248188513e-06, + "loss": 0.6521, + "step": 4933 + }, + { + "epoch": 1.8214373615555008, + "grad_norm": 0.8457323908805847, + "learning_rate": 3.973552369770267e-06, + "loss": 0.6271, + "step": 4934 + }, + { + "epoch": 1.821806546886537, + "grad_norm": 0.8489996194839478, + "learning_rate": 3.973159435578529e-06, + "loss": 0.6291, + "step": 4935 + }, + { + "epoch": 1.8221757322175733, + "grad_norm": 0.8581608533859253, + "learning_rate": 3.972766445628171e-06, + "loss": 0.634, + "step": 4936 + }, + { + "epoch": 1.8225449175486093, + "grad_norm": 0.8876478672027588, + "learning_rate": 3.972373399934068e-06, + "loss": 0.6668, + "step": 4937 + }, + { + "epoch": 1.8229141028796456, + "grad_norm": 0.8752370476722717, + "learning_rate": 3.971980298511097e-06, + "loss": 0.6963, + "step": 4938 + }, + { + "epoch": 1.8232832882106818, + "grad_norm": 0.863472044467926, + "learning_rate": 3.971587141374136e-06, + "loss": 0.6331, + "step": 4939 + }, + { + "epoch": 1.8236524735417179, + "grad_norm": 0.8516963124275208, + "learning_rate": 3.971193928538065e-06, + "loss": 0.626, + "step": 4940 + }, + { + "epoch": 1.8240216588727542, + "grad_norm": 0.8794181942939758, + "learning_rate": 3.9708006600177695e-06, + "loss": 0.6412, + "step": 4941 + }, + { + "epoch": 1.8243908442037902, + "grad_norm": 0.8745574355125427, + "learning_rate": 3.970407335828132e-06, + "loss": 0.6115, + "step": 4942 + }, + { + "epoch": 1.8247600295348265, + "grad_norm": 0.8484980463981628, + "learning_rate": 3.970013955984042e-06, + "loss": 0.6516, + "step": 4943 + }, + { + "epoch": 1.8251292148658627, + "grad_norm": 0.8250308036804199, + "learning_rate": 3.969620520500388e-06, + "loss": 0.6337, + "step": 4944 + }, + { + "epoch": 1.8254984001968988, + "grad_norm": 0.8786753416061401, + "learning_rate": 3.96922702939206e-06, + "loss": 0.6394, + "step": 4945 + }, + { + "epoch": 1.8258675855279352, + "grad_norm": 0.8524734377861023, + "learning_rate": 3.968833482673953e-06, + "loss": 0.6507, + "step": 4946 + }, + { + "epoch": 1.826236770858971, + "grad_norm": 0.8850034475326538, + "learning_rate": 3.968439880360963e-06, + "loss": 0.6608, + "step": 4947 + }, + { + "epoch": 1.8266059561900074, + "grad_norm": 0.8589314222335815, + "learning_rate": 3.968046222467986e-06, + "loss": 0.6926, + "step": 4948 + }, + { + "epoch": 1.8269751415210436, + "grad_norm": 0.8712340593338013, + "learning_rate": 3.967652509009923e-06, + "loss": 0.6333, + "step": 4949 + }, + { + "epoch": 1.8273443268520797, + "grad_norm": 0.8626800179481506, + "learning_rate": 3.967258740001676e-06, + "loss": 0.6118, + "step": 4950 + }, + { + "epoch": 1.827713512183116, + "grad_norm": 0.8229546546936035, + "learning_rate": 3.966864915458148e-06, + "loss": 0.6661, + "step": 4951 + }, + { + "epoch": 1.828082697514152, + "grad_norm": 0.8721511363983154, + "learning_rate": 3.966471035394247e-06, + "loss": 0.6772, + "step": 4952 + }, + { + "epoch": 1.8284518828451883, + "grad_norm": 0.8818221688270569, + "learning_rate": 3.96607709982488e-06, + "loss": 0.6765, + "step": 4953 + }, + { + "epoch": 1.8288210681762245, + "grad_norm": 0.8656367063522339, + "learning_rate": 3.9656831087649564e-06, + "loss": 0.6467, + "step": 4954 + }, + { + "epoch": 1.8291902535072606, + "grad_norm": 0.854407548904419, + "learning_rate": 3.96528906222939e-06, + "loss": 0.6294, + "step": 4955 + }, + { + "epoch": 1.829559438838297, + "grad_norm": 0.8640506863594055, + "learning_rate": 3.964894960233095e-06, + "loss": 0.6391, + "step": 4956 + }, + { + "epoch": 1.8299286241693329, + "grad_norm": 0.82984858751297, + "learning_rate": 3.964500802790989e-06, + "loss": 0.6353, + "step": 4957 + }, + { + "epoch": 1.8302978095003692, + "grad_norm": 0.86293625831604, + "learning_rate": 3.96410658991799e-06, + "loss": 0.6126, + "step": 4958 + }, + { + "epoch": 1.8306669948314054, + "grad_norm": 0.8769557476043701, + "learning_rate": 3.9637123216290185e-06, + "loss": 0.6591, + "step": 4959 + }, + { + "epoch": 1.8310361801624415, + "grad_norm": 0.8608360886573792, + "learning_rate": 3.963317997938997e-06, + "loss": 0.6716, + "step": 4960 + }, + { + "epoch": 1.8314053654934779, + "grad_norm": 0.864945113658905, + "learning_rate": 3.962923618862851e-06, + "loss": 0.6371, + "step": 4961 + }, + { + "epoch": 1.8317745508245138, + "grad_norm": 0.8723883032798767, + "learning_rate": 3.9625291844155075e-06, + "loss": 0.6814, + "step": 4962 + }, + { + "epoch": 1.8321437361555502, + "grad_norm": 0.8627622723579407, + "learning_rate": 3.962134694611896e-06, + "loss": 0.6384, + "step": 4963 + }, + { + "epoch": 1.8325129214865863, + "grad_norm": 0.9421527981758118, + "learning_rate": 3.961740149466949e-06, + "loss": 0.6673, + "step": 4964 + }, + { + "epoch": 1.8328821068176224, + "grad_norm": 0.9081157445907593, + "learning_rate": 3.961345548995598e-06, + "loss": 0.6233, + "step": 4965 + }, + { + "epoch": 1.8332512921486588, + "grad_norm": 0.8545625805854797, + "learning_rate": 3.960950893212778e-06, + "loss": 0.6812, + "step": 4966 + }, + { + "epoch": 1.8336204774796947, + "grad_norm": 0.86126708984375, + "learning_rate": 3.960556182133429e-06, + "loss": 0.6601, + "step": 4967 + }, + { + "epoch": 1.833989662810731, + "grad_norm": 0.8778650760650635, + "learning_rate": 3.96016141577249e-06, + "loss": 0.6566, + "step": 4968 + }, + { + "epoch": 1.8343588481417672, + "grad_norm": 0.8289297819137573, + "learning_rate": 3.9597665941449014e-06, + "loss": 0.6626, + "step": 4969 + }, + { + "epoch": 1.8347280334728033, + "grad_norm": 0.8638941049575806, + "learning_rate": 3.959371717265608e-06, + "loss": 0.6501, + "step": 4970 + }, + { + "epoch": 1.8350972188038397, + "grad_norm": 0.8619229793548584, + "learning_rate": 3.958976785149555e-06, + "loss": 0.652, + "step": 4971 + }, + { + "epoch": 1.8354664041348756, + "grad_norm": 0.8610534071922302, + "learning_rate": 3.958581797811693e-06, + "loss": 0.6089, + "step": 4972 + }, + { + "epoch": 1.835835589465912, + "grad_norm": 0.8480890989303589, + "learning_rate": 3.958186755266969e-06, + "loss": 0.663, + "step": 4973 + }, + { + "epoch": 1.836204774796948, + "grad_norm": 0.8583888411521912, + "learning_rate": 3.957791657530337e-06, + "loss": 0.6264, + "step": 4974 + }, + { + "epoch": 1.8365739601279842, + "grad_norm": 0.848208487033844, + "learning_rate": 3.95739650461675e-06, + "loss": 0.6517, + "step": 4975 + }, + { + "epoch": 1.8369431454590204, + "grad_norm": 0.8883641362190247, + "learning_rate": 3.957001296541165e-06, + "loss": 0.6416, + "step": 4976 + }, + { + "epoch": 1.8373123307900565, + "grad_norm": 0.8472451567649841, + "learning_rate": 3.956606033318542e-06, + "loss": 0.6566, + "step": 4977 + }, + { + "epoch": 1.8376815161210929, + "grad_norm": 0.8469805121421814, + "learning_rate": 3.9562107149638405e-06, + "loss": 0.634, + "step": 4978 + }, + { + "epoch": 1.838050701452129, + "grad_norm": 0.8471964001655579, + "learning_rate": 3.955815341492022e-06, + "loss": 0.6835, + "step": 4979 + }, + { + "epoch": 1.8384198867831651, + "grad_norm": 0.9064168334007263, + "learning_rate": 3.955419912918053e-06, + "loss": 0.6188, + "step": 4980 + }, + { + "epoch": 1.8387890721142013, + "grad_norm": 0.8929723501205444, + "learning_rate": 3.955024429256899e-06, + "loss": 0.6872, + "step": 4981 + }, + { + "epoch": 1.8391582574452374, + "grad_norm": 0.9227088689804077, + "learning_rate": 3.95462889052353e-06, + "loss": 0.6665, + "step": 4982 + }, + { + "epoch": 1.8395274427762738, + "grad_norm": 0.8746890425682068, + "learning_rate": 3.954233296732916e-06, + "loss": 0.6625, + "step": 4983 + }, + { + "epoch": 1.83989662810731, + "grad_norm": 0.8971319794654846, + "learning_rate": 3.953837647900031e-06, + "loss": 0.6592, + "step": 4984 + }, + { + "epoch": 1.840265813438346, + "grad_norm": 0.8558131456375122, + "learning_rate": 3.953441944039851e-06, + "loss": 0.6356, + "step": 4985 + }, + { + "epoch": 1.8406349987693822, + "grad_norm": 0.8695920705795288, + "learning_rate": 3.953046185167351e-06, + "loss": 0.6599, + "step": 4986 + }, + { + "epoch": 1.8410041841004183, + "grad_norm": 0.8566515445709229, + "learning_rate": 3.952650371297512e-06, + "loss": 0.662, + "step": 4987 + }, + { + "epoch": 1.8413733694314547, + "grad_norm": 0.8398551940917969, + "learning_rate": 3.952254502445314e-06, + "loss": 0.6431, + "step": 4988 + }, + { + "epoch": 1.8417425547624908, + "grad_norm": 0.8434669375419617, + "learning_rate": 3.951858578625742e-06, + "loss": 0.6363, + "step": 4989 + }, + { + "epoch": 1.842111740093527, + "grad_norm": 0.8622225522994995, + "learning_rate": 3.951462599853781e-06, + "loss": 0.6575, + "step": 4990 + }, + { + "epoch": 1.842480925424563, + "grad_norm": 0.8520545363426208, + "learning_rate": 3.9510665661444185e-06, + "loss": 0.6, + "step": 4991 + }, + { + "epoch": 1.8428501107555992, + "grad_norm": 0.8382810950279236, + "learning_rate": 3.950670477512645e-06, + "loss": 0.6058, + "step": 4992 + }, + { + "epoch": 1.8432192960866356, + "grad_norm": 0.8708922863006592, + "learning_rate": 3.950274333973451e-06, + "loss": 0.6467, + "step": 4993 + }, + { + "epoch": 1.8435884814176715, + "grad_norm": 0.8939244747161865, + "learning_rate": 3.949878135541832e-06, + "loss": 0.6347, + "step": 4994 + }, + { + "epoch": 1.8439576667487079, + "grad_norm": 0.8539683222770691, + "learning_rate": 3.949481882232782e-06, + "loss": 0.6456, + "step": 4995 + }, + { + "epoch": 1.844326852079744, + "grad_norm": 0.885854959487915, + "learning_rate": 3.949085574061301e-06, + "loss": 0.6414, + "step": 4996 + }, + { + "epoch": 1.8446960374107801, + "grad_norm": 0.8656726479530334, + "learning_rate": 3.948689211042389e-06, + "loss": 0.6431, + "step": 4997 + }, + { + "epoch": 1.8450652227418165, + "grad_norm": 0.846147894859314, + "learning_rate": 3.948292793191046e-06, + "loss": 0.6538, + "step": 4998 + }, + { + "epoch": 1.8454344080728524, + "grad_norm": 0.8988797068595886, + "learning_rate": 3.947896320522279e-06, + "loss": 0.6625, + "step": 4999 + }, + { + "epoch": 1.8458035934038888, + "grad_norm": 0.8584328889846802, + "learning_rate": 3.9474997930510925e-06, + "loss": 0.6279, + "step": 5000 + }, + { + "epoch": 1.846172778734925, + "grad_norm": 0.8658866286277771, + "learning_rate": 3.947103210792497e-06, + "loss": 0.6597, + "step": 5001 + }, + { + "epoch": 1.846541964065961, + "grad_norm": 0.8743900656700134, + "learning_rate": 3.9467065737615e-06, + "loss": 0.7044, + "step": 5002 + }, + { + "epoch": 1.8469111493969974, + "grad_norm": 0.8586175441741943, + "learning_rate": 3.946309881973117e-06, + "loss": 0.6335, + "step": 5003 + }, + { + "epoch": 1.8472803347280333, + "grad_norm": 0.851315438747406, + "learning_rate": 3.945913135442361e-06, + "loss": 0.6437, + "step": 5004 + }, + { + "epoch": 1.8476495200590697, + "grad_norm": 0.8587411642074585, + "learning_rate": 3.94551633418425e-06, + "loss": 0.6427, + "step": 5005 + }, + { + "epoch": 1.8480187053901058, + "grad_norm": 0.8425794839859009, + "learning_rate": 3.9451194782138e-06, + "loss": 0.6202, + "step": 5006 + }, + { + "epoch": 1.848387890721142, + "grad_norm": 0.8616281151771545, + "learning_rate": 3.9447225675460355e-06, + "loss": 0.6375, + "step": 5007 + }, + { + "epoch": 1.8487570760521783, + "grad_norm": 0.8673872947692871, + "learning_rate": 3.944325602195977e-06, + "loss": 0.6804, + "step": 5008 + }, + { + "epoch": 1.8491262613832142, + "grad_norm": 0.8959752321243286, + "learning_rate": 3.9439285821786504e-06, + "loss": 0.6438, + "step": 5009 + }, + { + "epoch": 1.8494954467142506, + "grad_norm": 0.85552579164505, + "learning_rate": 3.943531507509083e-06, + "loss": 0.6563, + "step": 5010 + }, + { + "epoch": 1.8498646320452867, + "grad_norm": 0.8582909107208252, + "learning_rate": 3.943134378202303e-06, + "loss": 0.6311, + "step": 5011 + }, + { + "epoch": 1.8502338173763229, + "grad_norm": 0.8698206543922424, + "learning_rate": 3.942737194273343e-06, + "loss": 0.6952, + "step": 5012 + }, + { + "epoch": 1.8506030027073592, + "grad_norm": 0.8811987042427063, + "learning_rate": 3.942339955737235e-06, + "loss": 0.6248, + "step": 5013 + }, + { + "epoch": 1.8509721880383951, + "grad_norm": 0.846393346786499, + "learning_rate": 3.941942662609015e-06, + "loss": 0.6199, + "step": 5014 + }, + { + "epoch": 1.8513413733694315, + "grad_norm": 0.8412083387374878, + "learning_rate": 3.941545314903721e-06, + "loss": 0.6711, + "step": 5015 + }, + { + "epoch": 1.8517105587004676, + "grad_norm": 0.8848696947097778, + "learning_rate": 3.941147912636391e-06, + "loss": 0.6717, + "step": 5016 + }, + { + "epoch": 1.8520797440315038, + "grad_norm": 0.8442311882972717, + "learning_rate": 3.9407504558220675e-06, + "loss": 0.5917, + "step": 5017 + }, + { + "epoch": 1.8524489293625401, + "grad_norm": 0.8754438757896423, + "learning_rate": 3.940352944475794e-06, + "loss": 0.6428, + "step": 5018 + }, + { + "epoch": 1.852818114693576, + "grad_norm": 0.8469083309173584, + "learning_rate": 3.939955378612617e-06, + "loss": 0.657, + "step": 5019 + }, + { + "epoch": 1.8531873000246124, + "grad_norm": 0.8899303674697876, + "learning_rate": 3.939557758247582e-06, + "loss": 0.6696, + "step": 5020 + }, + { + "epoch": 1.8535564853556485, + "grad_norm": 0.8740987777709961, + "learning_rate": 3.93916008339574e-06, + "loss": 0.616, + "step": 5021 + }, + { + "epoch": 1.8539256706866847, + "grad_norm": 0.8593077659606934, + "learning_rate": 3.938762354072144e-06, + "loss": 0.6367, + "step": 5022 + }, + { + "epoch": 1.854294856017721, + "grad_norm": 0.869674801826477, + "learning_rate": 3.938364570291847e-06, + "loss": 0.6365, + "step": 5023 + }, + { + "epoch": 1.854664041348757, + "grad_norm": 0.8814406991004944, + "learning_rate": 3.937966732069904e-06, + "loss": 0.6305, + "step": 5024 + }, + { + "epoch": 1.8550332266797933, + "grad_norm": 0.8666680455207825, + "learning_rate": 3.9375688394213755e-06, + "loss": 0.7098, + "step": 5025 + }, + { + "epoch": 1.8554024120108294, + "grad_norm": 0.872586727142334, + "learning_rate": 3.93717089236132e-06, + "loss": 0.6354, + "step": 5026 + }, + { + "epoch": 1.8557715973418656, + "grad_norm": 0.8778074979782104, + "learning_rate": 3.936772890904798e-06, + "loss": 0.6666, + "step": 5027 + }, + { + "epoch": 1.856140782672902, + "grad_norm": 0.850385308265686, + "learning_rate": 3.936374835066877e-06, + "loss": 0.6483, + "step": 5028 + }, + { + "epoch": 1.8565099680039379, + "grad_norm": 0.8686352372169495, + "learning_rate": 3.935976724862622e-06, + "loss": 0.6421, + "step": 5029 + }, + { + "epoch": 1.8568791533349742, + "grad_norm": 0.839472234249115, + "learning_rate": 3.9355785603071005e-06, + "loss": 0.6222, + "step": 5030 + }, + { + "epoch": 1.8572483386660104, + "grad_norm": 0.8695825338363647, + "learning_rate": 3.935180341415383e-06, + "loss": 0.6337, + "step": 5031 + }, + { + "epoch": 1.8576175239970465, + "grad_norm": 0.8531126379966736, + "learning_rate": 3.934782068202543e-06, + "loss": 0.6279, + "step": 5032 + }, + { + "epoch": 1.8579867093280829, + "grad_norm": 0.9101848006248474, + "learning_rate": 3.9343837406836545e-06, + "loss": 0.6585, + "step": 5033 + }, + { + "epoch": 1.8583558946591188, + "grad_norm": 0.8672534227371216, + "learning_rate": 3.933985358873795e-06, + "loss": 0.6445, + "step": 5034 + }, + { + "epoch": 1.8587250799901551, + "grad_norm": 0.8919646739959717, + "learning_rate": 3.933586922788042e-06, + "loss": 0.6509, + "step": 5035 + }, + { + "epoch": 1.8590942653211913, + "grad_norm": 0.9038968682289124, + "learning_rate": 3.933188432441476e-06, + "loss": 0.6692, + "step": 5036 + }, + { + "epoch": 1.8594634506522274, + "grad_norm": 0.8343707323074341, + "learning_rate": 3.93278988784918e-06, + "loss": 0.6713, + "step": 5037 + }, + { + "epoch": 1.8598326359832638, + "grad_norm": 0.8748628497123718, + "learning_rate": 3.932391289026239e-06, + "loss": 0.6689, + "step": 5038 + }, + { + "epoch": 1.8602018213142997, + "grad_norm": 0.8964290022850037, + "learning_rate": 3.931992635987741e-06, + "loss": 0.6846, + "step": 5039 + }, + { + "epoch": 1.860571006645336, + "grad_norm": 0.8588595390319824, + "learning_rate": 3.931593928748772e-06, + "loss": 0.6731, + "step": 5040 + }, + { + "epoch": 1.8609401919763722, + "grad_norm": 0.9072198271751404, + "learning_rate": 3.931195167324425e-06, + "loss": 0.6817, + "step": 5041 + }, + { + "epoch": 1.8613093773074083, + "grad_norm": 0.9468917846679688, + "learning_rate": 3.930796351729793e-06, + "loss": 0.6144, + "step": 5042 + }, + { + "epoch": 1.8616785626384444, + "grad_norm": 0.8823729753494263, + "learning_rate": 3.930397481979971e-06, + "loss": 0.6986, + "step": 5043 + }, + { + "epoch": 1.8620477479694806, + "grad_norm": 0.8857537508010864, + "learning_rate": 3.9299985580900546e-06, + "loss": 0.649, + "step": 5044 + }, + { + "epoch": 1.862416933300517, + "grad_norm": 0.862916886806488, + "learning_rate": 3.929599580075146e-06, + "loss": 0.6367, + "step": 5045 + }, + { + "epoch": 1.862786118631553, + "grad_norm": 0.8691606521606445, + "learning_rate": 3.929200547950342e-06, + "loss": 0.6997, + "step": 5046 + }, + { + "epoch": 1.8631553039625892, + "grad_norm": 0.877659022808075, + "learning_rate": 3.928801461730749e-06, + "loss": 0.6627, + "step": 5047 + }, + { + "epoch": 1.8635244892936254, + "grad_norm": 0.8542608022689819, + "learning_rate": 3.9284023214314724e-06, + "loss": 0.6264, + "step": 5048 + }, + { + "epoch": 1.8638936746246615, + "grad_norm": 0.8850525617599487, + "learning_rate": 3.928003127067619e-06, + "loss": 0.6482, + "step": 5049 + }, + { + "epoch": 1.8642628599556978, + "grad_norm": 0.8699657320976257, + "learning_rate": 3.927603878654297e-06, + "loss": 0.6288, + "step": 5050 + }, + { + "epoch": 1.864632045286734, + "grad_norm": 0.9085142612457275, + "learning_rate": 3.927204576206619e-06, + "loss": 0.6599, + "step": 5051 + }, + { + "epoch": 1.8650012306177701, + "grad_norm": 0.8851845860481262, + "learning_rate": 3.926805219739699e-06, + "loss": 0.6163, + "step": 5052 + }, + { + "epoch": 1.8653704159488063, + "grad_norm": 0.8605356216430664, + "learning_rate": 3.9264058092686505e-06, + "loss": 0.6495, + "step": 5053 + }, + { + "epoch": 1.8657396012798424, + "grad_norm": 0.880776584148407, + "learning_rate": 3.926006344808593e-06, + "loss": 0.6432, + "step": 5054 + }, + { + "epoch": 1.8661087866108788, + "grad_norm": 0.8991515636444092, + "learning_rate": 3.925606826374645e-06, + "loss": 0.6808, + "step": 5055 + }, + { + "epoch": 1.866477971941915, + "grad_norm": 0.8794136643409729, + "learning_rate": 3.92520725398193e-06, + "loss": 0.6046, + "step": 5056 + }, + { + "epoch": 1.866847157272951, + "grad_norm": 0.8678949475288391, + "learning_rate": 3.924807627645569e-06, + "loss": 0.7026, + "step": 5057 + }, + { + "epoch": 1.8672163426039872, + "grad_norm": 0.8510278463363647, + "learning_rate": 3.924407947380688e-06, + "loss": 0.6528, + "step": 5058 + }, + { + "epoch": 1.8675855279350233, + "grad_norm": 0.8878472447395325, + "learning_rate": 3.924008213202417e-06, + "loss": 0.6447, + "step": 5059 + }, + { + "epoch": 1.8679547132660597, + "grad_norm": 0.8474279046058655, + "learning_rate": 3.923608425125885e-06, + "loss": 0.6358, + "step": 5060 + }, + { + "epoch": 1.8683238985970956, + "grad_norm": 0.8585328459739685, + "learning_rate": 3.923208583166223e-06, + "loss": 0.632, + "step": 5061 + }, + { + "epoch": 1.868693083928132, + "grad_norm": 0.8520940542221069, + "learning_rate": 3.922808687338565e-06, + "loss": 0.6924, + "step": 5062 + }, + { + "epoch": 1.869062269259168, + "grad_norm": 0.8867542147636414, + "learning_rate": 3.922408737658047e-06, + "loss": 0.643, + "step": 5063 + }, + { + "epoch": 1.8694314545902042, + "grad_norm": 0.8439658284187317, + "learning_rate": 3.9220087341398075e-06, + "loss": 0.6551, + "step": 5064 + }, + { + "epoch": 1.8698006399212406, + "grad_norm": 0.8740468621253967, + "learning_rate": 3.921608676798985e-06, + "loss": 0.6694, + "step": 5065 + }, + { + "epoch": 1.8701698252522765, + "grad_norm": 0.85799640417099, + "learning_rate": 3.921208565650725e-06, + "loss": 0.6382, + "step": 5066 + }, + { + "epoch": 1.8705390105833128, + "grad_norm": 0.8660928606987, + "learning_rate": 3.920808400710169e-06, + "loss": 0.6244, + "step": 5067 + }, + { + "epoch": 1.870908195914349, + "grad_norm": 0.8590250611305237, + "learning_rate": 3.920408181992462e-06, + "loss": 0.6409, + "step": 5068 + }, + { + "epoch": 1.8712773812453851, + "grad_norm": 0.8571264743804932, + "learning_rate": 3.920007909512754e-06, + "loss": 0.6408, + "step": 5069 + }, + { + "epoch": 1.8716465665764215, + "grad_norm": 0.8991819024085999, + "learning_rate": 3.919607583286196e-06, + "loss": 0.6536, + "step": 5070 + }, + { + "epoch": 1.8720157519074574, + "grad_norm": 0.8701766133308411, + "learning_rate": 3.919207203327939e-06, + "loss": 0.6816, + "step": 5071 + }, + { + "epoch": 1.8723849372384938, + "grad_norm": 0.8901576399803162, + "learning_rate": 3.918806769653135e-06, + "loss": 0.6587, + "step": 5072 + }, + { + "epoch": 1.8727541225695299, + "grad_norm": 0.8675899505615234, + "learning_rate": 3.918406282276944e-06, + "loss": 0.634, + "step": 5073 + }, + { + "epoch": 1.873123307900566, + "grad_norm": 0.8970234394073486, + "learning_rate": 3.918005741214523e-06, + "loss": 0.6019, + "step": 5074 + }, + { + "epoch": 1.8734924932316024, + "grad_norm": 0.863715648651123, + "learning_rate": 3.917605146481032e-06, + "loss": 0.6184, + "step": 5075 + }, + { + "epoch": 1.8738616785626383, + "grad_norm": 0.8552356958389282, + "learning_rate": 3.917204498091633e-06, + "loss": 0.6291, + "step": 5076 + }, + { + "epoch": 1.8742308638936747, + "grad_norm": 0.8590555191040039, + "learning_rate": 3.916803796061492e-06, + "loss": 0.6296, + "step": 5077 + }, + { + "epoch": 1.8746000492247108, + "grad_norm": 0.8591761589050293, + "learning_rate": 3.916403040405775e-06, + "loss": 0.6177, + "step": 5078 + }, + { + "epoch": 1.874969234555747, + "grad_norm": 0.9096468687057495, + "learning_rate": 3.916002231139648e-06, + "loss": 0.6272, + "step": 5079 + }, + { + "epoch": 1.8753384198867833, + "grad_norm": 0.8583130836486816, + "learning_rate": 3.915601368278285e-06, + "loss": 0.6542, + "step": 5080 + }, + { + "epoch": 1.8757076052178192, + "grad_norm": 0.8641960620880127, + "learning_rate": 3.915200451836857e-06, + "loss": 0.6445, + "step": 5081 + }, + { + "epoch": 1.8760767905488556, + "grad_norm": 0.8667200803756714, + "learning_rate": 3.914799481830538e-06, + "loss": 0.6548, + "step": 5082 + }, + { + "epoch": 1.8764459758798917, + "grad_norm": 0.8726089000701904, + "learning_rate": 3.914398458274505e-06, + "loss": 0.6338, + "step": 5083 + }, + { + "epoch": 1.8768151612109278, + "grad_norm": 0.8848123550415039, + "learning_rate": 3.913997381183937e-06, + "loss": 0.6461, + "step": 5084 + }, + { + "epoch": 1.8771843465419642, + "grad_norm": 0.8517464995384216, + "learning_rate": 3.913596250574015e-06, + "loss": 0.65, + "step": 5085 + }, + { + "epoch": 1.8775535318730001, + "grad_norm": 0.8762145042419434, + "learning_rate": 3.913195066459919e-06, + "loss": 0.6257, + "step": 5086 + }, + { + "epoch": 1.8779227172040365, + "grad_norm": 0.8786299228668213, + "learning_rate": 3.912793828856838e-06, + "loss": 0.6376, + "step": 5087 + }, + { + "epoch": 1.8782919025350726, + "grad_norm": 0.8833543658256531, + "learning_rate": 3.912392537779955e-06, + "loss": 0.6252, + "step": 5088 + }, + { + "epoch": 1.8786610878661087, + "grad_norm": 0.8894073367118835, + "learning_rate": 3.911991193244461e-06, + "loss": 0.6488, + "step": 5089 + }, + { + "epoch": 1.879030273197145, + "grad_norm": 0.8546868562698364, + "learning_rate": 3.911589795265546e-06, + "loss": 0.6449, + "step": 5090 + }, + { + "epoch": 1.879399458528181, + "grad_norm": 0.863876461982727, + "learning_rate": 3.911188343858402e-06, + "loss": 0.6578, + "step": 5091 + }, + { + "epoch": 1.8797686438592174, + "grad_norm": 0.8900443911552429, + "learning_rate": 3.910786839038225e-06, + "loss": 0.6422, + "step": 5092 + }, + { + "epoch": 1.8801378291902535, + "grad_norm": 0.8807564973831177, + "learning_rate": 3.9103852808202115e-06, + "loss": 0.651, + "step": 5093 + }, + { + "epoch": 1.8805070145212897, + "grad_norm": 0.8684282302856445, + "learning_rate": 3.90998366921956e-06, + "loss": 0.6616, + "step": 5094 + }, + { + "epoch": 1.880876199852326, + "grad_norm": 0.8514790534973145, + "learning_rate": 3.909582004251472e-06, + "loss": 0.6667, + "step": 5095 + }, + { + "epoch": 1.881245385183362, + "grad_norm": 0.8838033080101013, + "learning_rate": 3.90918028593115e-06, + "loss": 0.6467, + "step": 5096 + }, + { + "epoch": 1.8816145705143983, + "grad_norm": 0.9010116457939148, + "learning_rate": 3.908778514273799e-06, + "loss": 0.6303, + "step": 5097 + }, + { + "epoch": 1.8819837558454344, + "grad_norm": 0.8862345218658447, + "learning_rate": 3.908376689294625e-06, + "loss": 0.6446, + "step": 5098 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.8646761178970337, + "learning_rate": 3.907974811008839e-06, + "loss": 0.646, + "step": 5099 + }, + { + "epoch": 1.882722126507507, + "grad_norm": 0.8843351602554321, + "learning_rate": 3.907572879431651e-06, + "loss": 0.671, + "step": 5100 + }, + { + "epoch": 1.8830913118385428, + "grad_norm": 0.9055441617965698, + "learning_rate": 3.907170894578272e-06, + "loss": 0.6305, + "step": 5101 + }, + { + "epoch": 1.8834604971695792, + "grad_norm": 0.8405061960220337, + "learning_rate": 3.90676885646392e-06, + "loss": 0.6884, + "step": 5102 + }, + { + "epoch": 1.8838296825006153, + "grad_norm": 0.8912229537963867, + "learning_rate": 3.90636676510381e-06, + "loss": 0.6432, + "step": 5103 + }, + { + "epoch": 1.8841988678316515, + "grad_norm": 0.8643870949745178, + "learning_rate": 3.9059646205131626e-06, + "loss": 0.6457, + "step": 5104 + }, + { + "epoch": 1.8845680531626878, + "grad_norm": 0.8662566542625427, + "learning_rate": 3.905562422707198e-06, + "loss": 0.643, + "step": 5105 + }, + { + "epoch": 1.8849372384937237, + "grad_norm": 0.8688254952430725, + "learning_rate": 3.905160171701139e-06, + "loss": 0.6338, + "step": 5106 + }, + { + "epoch": 1.88530642382476, + "grad_norm": 0.875701367855072, + "learning_rate": 3.90475786751021e-06, + "loss": 0.6605, + "step": 5107 + }, + { + "epoch": 1.8856756091557962, + "grad_norm": 0.8786723017692566, + "learning_rate": 3.90435551014964e-06, + "loss": 0.6688, + "step": 5108 + }, + { + "epoch": 1.8860447944868324, + "grad_norm": 0.8383857011795044, + "learning_rate": 3.903953099634658e-06, + "loss": 0.6348, + "step": 5109 + }, + { + "epoch": 1.8864139798178687, + "grad_norm": 0.8509512543678284, + "learning_rate": 3.903550635980493e-06, + "loss": 0.6602, + "step": 5110 + }, + { + "epoch": 1.8867831651489047, + "grad_norm": 0.8737908005714417, + "learning_rate": 3.9031481192023804e-06, + "loss": 0.6436, + "step": 5111 + }, + { + "epoch": 1.887152350479941, + "grad_norm": 0.8805723786354065, + "learning_rate": 3.902745549315553e-06, + "loss": 0.6501, + "step": 5112 + }, + { + "epoch": 1.8875215358109771, + "grad_norm": 0.8969019055366516, + "learning_rate": 3.9023429263352505e-06, + "loss": 0.6049, + "step": 5113 + }, + { + "epoch": 1.8878907211420133, + "grad_norm": 0.8479764461517334, + "learning_rate": 3.90194025027671e-06, + "loss": 0.6482, + "step": 5114 + }, + { + "epoch": 1.8882599064730494, + "grad_norm": 0.8753570914268494, + "learning_rate": 3.901537521155175e-06, + "loss": 0.6626, + "step": 5115 + }, + { + "epoch": 1.8886290918040856, + "grad_norm": 0.8767268061637878, + "learning_rate": 3.901134738985886e-06, + "loss": 0.6626, + "step": 5116 + }, + { + "epoch": 1.888998277135122, + "grad_norm": 0.8746569156646729, + "learning_rate": 3.900731903784088e-06, + "loss": 0.6552, + "step": 5117 + }, + { + "epoch": 1.889367462466158, + "grad_norm": 0.8696516156196594, + "learning_rate": 3.900329015565032e-06, + "loss": 0.6508, + "step": 5118 + }, + { + "epoch": 1.8897366477971942, + "grad_norm": 0.849485456943512, + "learning_rate": 3.899926074343963e-06, + "loss": 0.7004, + "step": 5119 + }, + { + "epoch": 1.8901058331282303, + "grad_norm": 0.867277204990387, + "learning_rate": 3.899523080136135e-06, + "loss": 0.6876, + "step": 5120 + }, + { + "epoch": 1.8904750184592665, + "grad_norm": 0.8665715456008911, + "learning_rate": 3.899120032956799e-06, + "loss": 0.6902, + "step": 5121 + }, + { + "epoch": 1.8908442037903028, + "grad_norm": 0.8781852126121521, + "learning_rate": 3.8987169328212125e-06, + "loss": 0.6303, + "step": 5122 + }, + { + "epoch": 1.891213389121339, + "grad_norm": 0.8699957728385925, + "learning_rate": 3.898313779744631e-06, + "loss": 0.6619, + "step": 5123 + }, + { + "epoch": 1.891582574452375, + "grad_norm": 0.8552836179733276, + "learning_rate": 3.897910573742314e-06, + "loss": 0.6342, + "step": 5124 + }, + { + "epoch": 1.8919517597834112, + "grad_norm": 0.8725170493125916, + "learning_rate": 3.8975073148295235e-06, + "loss": 0.6782, + "step": 5125 + }, + { + "epoch": 1.8923209451144474, + "grad_norm": 0.8795178532600403, + "learning_rate": 3.897104003021521e-06, + "loss": 0.6388, + "step": 5126 + }, + { + "epoch": 1.8926901304454837, + "grad_norm": 0.8377705812454224, + "learning_rate": 3.896700638333575e-06, + "loss": 0.697, + "step": 5127 + }, + { + "epoch": 1.8930593157765196, + "grad_norm": 0.874328076839447, + "learning_rate": 3.896297220780949e-06, + "loss": 0.64, + "step": 5128 + }, + { + "epoch": 1.893428501107556, + "grad_norm": 0.865559458732605, + "learning_rate": 3.895893750378914e-06, + "loss": 0.6767, + "step": 5129 + }, + { + "epoch": 1.8937976864385921, + "grad_norm": 0.8656126856803894, + "learning_rate": 3.895490227142742e-06, + "loss": 0.668, + "step": 5130 + }, + { + "epoch": 1.8941668717696283, + "grad_norm": 0.8585976958274841, + "learning_rate": 3.895086651087704e-06, + "loss": 0.6403, + "step": 5131 + }, + { + "epoch": 1.8945360571006646, + "grad_norm": 0.8529151678085327, + "learning_rate": 3.894683022229077e-06, + "loss": 0.6418, + "step": 5132 + }, + { + "epoch": 1.8949052424317006, + "grad_norm": 0.8578778505325317, + "learning_rate": 3.894279340582139e-06, + "loss": 0.6367, + "step": 5133 + }, + { + "epoch": 1.895274427762737, + "grad_norm": 0.8727679252624512, + "learning_rate": 3.893875606162167e-06, + "loss": 0.6735, + "step": 5134 + }, + { + "epoch": 1.895643613093773, + "grad_norm": 0.863482654094696, + "learning_rate": 3.893471818984443e-06, + "loss": 0.6764, + "step": 5135 + }, + { + "epoch": 1.8960127984248092, + "grad_norm": 0.8773802518844604, + "learning_rate": 3.893067979064251e-06, + "loss": 0.6536, + "step": 5136 + }, + { + "epoch": 1.8963819837558455, + "grad_norm": 0.853804349899292, + "learning_rate": 3.892664086416875e-06, + "loss": 0.6498, + "step": 5137 + }, + { + "epoch": 1.8967511690868815, + "grad_norm": 0.8852234482765198, + "learning_rate": 3.8922601410576035e-06, + "loss": 0.6219, + "step": 5138 + }, + { + "epoch": 1.8971203544179178, + "grad_norm": 0.8455061316490173, + "learning_rate": 3.891856143001725e-06, + "loss": 0.6478, + "step": 5139 + }, + { + "epoch": 1.897489539748954, + "grad_norm": 0.8726845383644104, + "learning_rate": 3.89145209226453e-06, + "loss": 0.6177, + "step": 5140 + }, + { + "epoch": 1.89785872507999, + "grad_norm": 0.872648298740387, + "learning_rate": 3.891047988861313e-06, + "loss": 0.6525, + "step": 5141 + }, + { + "epoch": 1.8982279104110265, + "grad_norm": 0.8607513308525085, + "learning_rate": 3.890643832807369e-06, + "loss": 0.6502, + "step": 5142 + }, + { + "epoch": 1.8985970957420624, + "grad_norm": 0.8876277804374695, + "learning_rate": 3.8902396241179944e-06, + "loss": 0.6623, + "step": 5143 + }, + { + "epoch": 1.8989662810730987, + "grad_norm": 0.8707872033119202, + "learning_rate": 3.889835362808489e-06, + "loss": 0.6683, + "step": 5144 + }, + { + "epoch": 1.8993354664041349, + "grad_norm": 0.843288004398346, + "learning_rate": 3.889431048894154e-06, + "loss": 0.6424, + "step": 5145 + }, + { + "epoch": 1.899704651735171, + "grad_norm": 0.8729190826416016, + "learning_rate": 3.889026682390291e-06, + "loss": 0.6446, + "step": 5146 + }, + { + "epoch": 1.9000738370662074, + "grad_norm": 0.8648591637611389, + "learning_rate": 3.888622263312208e-06, + "loss": 0.6669, + "step": 5147 + }, + { + "epoch": 1.9004430223972433, + "grad_norm": 0.8842174410820007, + "learning_rate": 3.88821779167521e-06, + "loss": 0.6272, + "step": 5148 + }, + { + "epoch": 1.9008122077282796, + "grad_norm": 0.8525534868240356, + "learning_rate": 3.8878132674946065e-06, + "loss": 0.6274, + "step": 5149 + }, + { + "epoch": 1.9011813930593158, + "grad_norm": 0.8453383445739746, + "learning_rate": 3.887408690785709e-06, + "loss": 0.6342, + "step": 5150 + }, + { + "epoch": 1.901550578390352, + "grad_norm": 0.8466452956199646, + "learning_rate": 3.8870040615638305e-06, + "loss": 0.6394, + "step": 5151 + }, + { + "epoch": 1.9019197637213883, + "grad_norm": 0.8761457204818726, + "learning_rate": 3.886599379844286e-06, + "loss": 0.6658, + "step": 5152 + }, + { + "epoch": 1.9022889490524242, + "grad_norm": 0.8493171334266663, + "learning_rate": 3.8861946456423924e-06, + "loss": 0.629, + "step": 5153 + }, + { + "epoch": 1.9026581343834605, + "grad_norm": 0.878124475479126, + "learning_rate": 3.88578985897347e-06, + "loss": 0.6609, + "step": 5154 + }, + { + "epoch": 1.9030273197144967, + "grad_norm": 0.8812121748924255, + "learning_rate": 3.885385019852838e-06, + "loss": 0.6694, + "step": 5155 + }, + { + "epoch": 1.9033965050455328, + "grad_norm": 1.0960946083068848, + "learning_rate": 3.8849801282958195e-06, + "loss": 0.6468, + "step": 5156 + }, + { + "epoch": 1.9037656903765692, + "grad_norm": 0.8738629817962646, + "learning_rate": 3.884575184317741e-06, + "loss": 0.6385, + "step": 5157 + }, + { + "epoch": 1.904134875707605, + "grad_norm": 0.8529300093650818, + "learning_rate": 3.884170187933929e-06, + "loss": 0.6114, + "step": 5158 + }, + { + "epoch": 1.9045040610386414, + "grad_norm": 0.8483361005783081, + "learning_rate": 3.883765139159712e-06, + "loss": 0.6662, + "step": 5159 + }, + { + "epoch": 1.9048732463696776, + "grad_norm": 0.8899786472320557, + "learning_rate": 3.88336003801042e-06, + "loss": 0.6408, + "step": 5160 + }, + { + "epoch": 1.9052424317007137, + "grad_norm": 0.8885873556137085, + "learning_rate": 3.882954884501389e-06, + "loss": 0.6206, + "step": 5161 + }, + { + "epoch": 1.90561161703175, + "grad_norm": 0.8312538862228394, + "learning_rate": 3.882549678647951e-06, + "loss": 0.6441, + "step": 5162 + }, + { + "epoch": 1.905980802362786, + "grad_norm": 0.8904805183410645, + "learning_rate": 3.882144420465445e-06, + "loss": 0.6349, + "step": 5163 + }, + { + "epoch": 1.9063499876938224, + "grad_norm": 0.8556720018386841, + "learning_rate": 3.881739109969208e-06, + "loss": 0.653, + "step": 5164 + }, + { + "epoch": 1.9067191730248585, + "grad_norm": 0.8680163621902466, + "learning_rate": 3.881333747174581e-06, + "loss": 0.5908, + "step": 5165 + }, + { + "epoch": 1.9070883583558946, + "grad_norm": 0.8291980624198914, + "learning_rate": 3.880928332096909e-06, + "loss": 0.6565, + "step": 5166 + }, + { + "epoch": 1.907457543686931, + "grad_norm": 0.8843839764595032, + "learning_rate": 3.880522864751536e-06, + "loss": 0.6288, + "step": 5167 + }, + { + "epoch": 1.907826729017967, + "grad_norm": 0.8815593719482422, + "learning_rate": 3.880117345153807e-06, + "loss": 0.6555, + "step": 5168 + }, + { + "epoch": 1.9081959143490033, + "grad_norm": 0.8666946887969971, + "learning_rate": 3.879711773319072e-06, + "loss": 0.6759, + "step": 5169 + }, + { + "epoch": 1.9085650996800394, + "grad_norm": 0.86658775806427, + "learning_rate": 3.879306149262682e-06, + "loss": 0.6219, + "step": 5170 + }, + { + "epoch": 1.9089342850110755, + "grad_norm": 0.8768011331558228, + "learning_rate": 3.8789004729999904e-06, + "loss": 0.6391, + "step": 5171 + }, + { + "epoch": 1.909303470342112, + "grad_norm": 0.8864206671714783, + "learning_rate": 3.8784947445463504e-06, + "loss": 0.6451, + "step": 5172 + }, + { + "epoch": 1.9096726556731478, + "grad_norm": 0.9627906680107117, + "learning_rate": 3.87808896391712e-06, + "loss": 0.6734, + "step": 5173 + }, + { + "epoch": 1.9100418410041842, + "grad_norm": 0.8498241901397705, + "learning_rate": 3.877683131127656e-06, + "loss": 0.6445, + "step": 5174 + }, + { + "epoch": 1.9104110263352203, + "grad_norm": 0.8660293221473694, + "learning_rate": 3.877277246193321e-06, + "loss": 0.6558, + "step": 5175 + }, + { + "epoch": 1.9107802116662564, + "grad_norm": 0.9413408637046814, + "learning_rate": 3.8768713091294776e-06, + "loss": 0.6674, + "step": 5176 + }, + { + "epoch": 1.9111493969972928, + "grad_norm": 0.8691025972366333, + "learning_rate": 3.876465319951489e-06, + "loss": 0.6451, + "step": 5177 + }, + { + "epoch": 1.9115185823283287, + "grad_norm": 0.8675034642219543, + "learning_rate": 3.876059278674722e-06, + "loss": 0.695, + "step": 5178 + }, + { + "epoch": 1.911887767659365, + "grad_norm": 0.8674656748771667, + "learning_rate": 3.8756531853145464e-06, + "loss": 0.6549, + "step": 5179 + }, + { + "epoch": 1.9122569529904012, + "grad_norm": 0.8748687505722046, + "learning_rate": 3.875247039886331e-06, + "loss": 0.6299, + "step": 5180 + }, + { + "epoch": 1.9126261383214374, + "grad_norm": 0.8572069406509399, + "learning_rate": 3.87484084240545e-06, + "loss": 0.6469, + "step": 5181 + }, + { + "epoch": 1.9129953236524735, + "grad_norm": 0.8752861022949219, + "learning_rate": 3.874434592887277e-06, + "loss": 0.649, + "step": 5182 + }, + { + "epoch": 1.9133645089835096, + "grad_norm": 0.8727638721466064, + "learning_rate": 3.8740282913471886e-06, + "loss": 0.6143, + "step": 5183 + }, + { + "epoch": 1.913733694314546, + "grad_norm": 0.8945254683494568, + "learning_rate": 3.873621937800562e-06, + "loss": 0.6544, + "step": 5184 + }, + { + "epoch": 1.9141028796455821, + "grad_norm": 0.9075742959976196, + "learning_rate": 3.87321553226278e-06, + "loss": 0.6341, + "step": 5185 + }, + { + "epoch": 1.9144720649766183, + "grad_norm": 0.861980676651001, + "learning_rate": 3.872809074749222e-06, + "loss": 0.6826, + "step": 5186 + }, + { + "epoch": 1.9148412503076544, + "grad_norm": 0.8706071972846985, + "learning_rate": 3.872402565275275e-06, + "loss": 0.6591, + "step": 5187 + }, + { + "epoch": 1.9152104356386905, + "grad_norm": 0.8605721592903137, + "learning_rate": 3.871996003856323e-06, + "loss": 0.6093, + "step": 5188 + }, + { + "epoch": 1.915579620969727, + "grad_norm": 0.8560497164726257, + "learning_rate": 3.871589390507756e-06, + "loss": 0.6779, + "step": 5189 + }, + { + "epoch": 1.915948806300763, + "grad_norm": 0.8877153396606445, + "learning_rate": 3.871182725244963e-06, + "loss": 0.6496, + "step": 5190 + }, + { + "epoch": 1.9163179916317992, + "grad_norm": 0.872919499874115, + "learning_rate": 3.870776008083337e-06, + "loss": 0.6392, + "step": 5191 + }, + { + "epoch": 1.9166871769628353, + "grad_norm": 0.8682100772857666, + "learning_rate": 3.870369239038272e-06, + "loss": 0.6458, + "step": 5192 + }, + { + "epoch": 1.9170563622938714, + "grad_norm": 0.8663478493690491, + "learning_rate": 3.869962418125163e-06, + "loss": 0.6527, + "step": 5193 + }, + { + "epoch": 1.9174255476249078, + "grad_norm": 0.8702707290649414, + "learning_rate": 3.869555545359409e-06, + "loss": 0.6891, + "step": 5194 + }, + { + "epoch": 1.917794732955944, + "grad_norm": 0.855040431022644, + "learning_rate": 3.86914862075641e-06, + "loss": 0.6617, + "step": 5195 + }, + { + "epoch": 1.91816391828698, + "grad_norm": 0.8726851344108582, + "learning_rate": 3.8687416443315674e-06, + "loss": 0.6605, + "step": 5196 + }, + { + "epoch": 1.9185331036180162, + "grad_norm": 0.8857707977294922, + "learning_rate": 3.868334616100286e-06, + "loss": 0.6576, + "step": 5197 + }, + { + "epoch": 1.9189022889490523, + "grad_norm": 0.8671833276748657, + "learning_rate": 3.867927536077971e-06, + "loss": 0.5977, + "step": 5198 + }, + { + "epoch": 1.9192714742800887, + "grad_norm": 0.8407081365585327, + "learning_rate": 3.867520404280031e-06, + "loss": 0.627, + "step": 5199 + }, + { + "epoch": 1.9196406596111246, + "grad_norm": 0.8633925318717957, + "learning_rate": 3.867113220721875e-06, + "loss": 0.6341, + "step": 5200 + }, + { + "epoch": 1.920009844942161, + "grad_norm": 0.8807472586631775, + "learning_rate": 3.866705985418916e-06, + "loss": 0.6531, + "step": 5201 + }, + { + "epoch": 1.9203790302731971, + "grad_norm": 0.8749665021896362, + "learning_rate": 3.8662986983865645e-06, + "loss": 0.6753, + "step": 5202 + }, + { + "epoch": 1.9207482156042333, + "grad_norm": 0.8612962365150452, + "learning_rate": 3.86589135964024e-06, + "loss": 0.6122, + "step": 5203 + }, + { + "epoch": 1.9211174009352696, + "grad_norm": 0.8547215461730957, + "learning_rate": 3.865483969195359e-06, + "loss": 0.6662, + "step": 5204 + }, + { + "epoch": 1.9214865862663055, + "grad_norm": 0.8542854189872742, + "learning_rate": 3.8650765270673405e-06, + "loss": 0.6643, + "step": 5205 + }, + { + "epoch": 1.9218557715973419, + "grad_norm": 0.8760797381401062, + "learning_rate": 3.864669033271606e-06, + "loss": 0.621, + "step": 5206 + }, + { + "epoch": 1.922224956928378, + "grad_norm": 0.8439907431602478, + "learning_rate": 3.864261487823579e-06, + "loss": 0.6576, + "step": 5207 + }, + { + "epoch": 1.9225941422594142, + "grad_norm": 0.8423620462417603, + "learning_rate": 3.863853890738685e-06, + "loss": 0.6519, + "step": 5208 + }, + { + "epoch": 1.9229633275904505, + "grad_norm": 0.8624527454376221, + "learning_rate": 3.863446242032352e-06, + "loss": 0.6433, + "step": 5209 + }, + { + "epoch": 1.9233325129214864, + "grad_norm": 0.8675430417060852, + "learning_rate": 3.86303854172001e-06, + "loss": 0.6711, + "step": 5210 + }, + { + "epoch": 1.9237016982525228, + "grad_norm": 0.8547025918960571, + "learning_rate": 3.862630789817089e-06, + "loss": 0.6351, + "step": 5211 + }, + { + "epoch": 1.924070883583559, + "grad_norm": 0.871283769607544, + "learning_rate": 3.862222986339022e-06, + "loss": 0.5964, + "step": 5212 + }, + { + "epoch": 1.924440068914595, + "grad_norm": 0.8317211866378784, + "learning_rate": 3.861815131301245e-06, + "loss": 0.6303, + "step": 5213 + }, + { + "epoch": 1.9248092542456314, + "grad_norm": 0.8795280456542969, + "learning_rate": 3.8614072247191944e-06, + "loss": 0.6546, + "step": 5214 + }, + { + "epoch": 1.9251784395766673, + "grad_norm": 0.8899900913238525, + "learning_rate": 3.860999266608312e-06, + "loss": 0.6469, + "step": 5215 + }, + { + "epoch": 1.9255476249077037, + "grad_norm": 0.8571198582649231, + "learning_rate": 3.860591256984035e-06, + "loss": 0.6619, + "step": 5216 + }, + { + "epoch": 1.9259168102387398, + "grad_norm": 0.8396755456924438, + "learning_rate": 3.8601831958618095e-06, + "loss": 0.635, + "step": 5217 + }, + { + "epoch": 1.926285995569776, + "grad_norm": 0.8526062965393066, + "learning_rate": 3.859775083257078e-06, + "loss": 0.6448, + "step": 5218 + }, + { + "epoch": 1.9266551809008123, + "grad_norm": 0.886280357837677, + "learning_rate": 3.85936691918529e-06, + "loss": 0.6752, + "step": 5219 + }, + { + "epoch": 1.9270243662318483, + "grad_norm": 0.905320405960083, + "learning_rate": 3.858958703661893e-06, + "loss": 0.6838, + "step": 5220 + }, + { + "epoch": 1.9273935515628846, + "grad_norm": 0.8574177026748657, + "learning_rate": 3.858550436702338e-06, + "loss": 0.6254, + "step": 5221 + }, + { + "epoch": 1.9277627368939207, + "grad_norm": 0.8267344236373901, + "learning_rate": 3.858142118322077e-06, + "loss": 0.6594, + "step": 5222 + }, + { + "epoch": 1.9281319222249569, + "grad_norm": 0.8510192036628723, + "learning_rate": 3.857733748536566e-06, + "loss": 0.612, + "step": 5223 + }, + { + "epoch": 1.9285011075559932, + "grad_norm": 0.8785051703453064, + "learning_rate": 3.857325327361261e-06, + "loss": 0.6261, + "step": 5224 + }, + { + "epoch": 1.9288702928870292, + "grad_norm": 0.835461437702179, + "learning_rate": 3.856916854811621e-06, + "loss": 0.6092, + "step": 5225 + }, + { + "epoch": 1.9292394782180655, + "grad_norm": 0.8531568646430969, + "learning_rate": 3.856508330903106e-06, + "loss": 0.665, + "step": 5226 + }, + { + "epoch": 1.9296086635491017, + "grad_norm": 0.8571925759315491, + "learning_rate": 3.856099755651179e-06, + "loss": 0.6464, + "step": 5227 + }, + { + "epoch": 1.9299778488801378, + "grad_norm": 0.873293399810791, + "learning_rate": 3.855691129071304e-06, + "loss": 0.6621, + "step": 5228 + }, + { + "epoch": 1.9303470342111742, + "grad_norm": 0.8844919204711914, + "learning_rate": 3.855282451178948e-06, + "loss": 0.6128, + "step": 5229 + }, + { + "epoch": 1.93071621954221, + "grad_norm": 0.8464515209197998, + "learning_rate": 3.8548737219895784e-06, + "loss": 0.672, + "step": 5230 + }, + { + "epoch": 1.9310854048732464, + "grad_norm": 0.8769994974136353, + "learning_rate": 3.854464941518666e-06, + "loss": 0.6428, + "step": 5231 + }, + { + "epoch": 1.9314545902042826, + "grad_norm": 0.862056314945221, + "learning_rate": 3.8540561097816835e-06, + "loss": 0.6647, + "step": 5232 + }, + { + "epoch": 1.9318237755353187, + "grad_norm": 0.8625699877738953, + "learning_rate": 3.853647226794105e-06, + "loss": 0.6325, + "step": 5233 + }, + { + "epoch": 1.932192960866355, + "grad_norm": 0.8642911314964294, + "learning_rate": 3.853238292571405e-06, + "loss": 0.6527, + "step": 5234 + }, + { + "epoch": 1.932562146197391, + "grad_norm": 0.8521655797958374, + "learning_rate": 3.8528293071290636e-06, + "loss": 0.6884, + "step": 5235 + }, + { + "epoch": 1.9329313315284273, + "grad_norm": 0.8746679425239563, + "learning_rate": 3.8524202704825595e-06, + "loss": 0.6316, + "step": 5236 + }, + { + "epoch": 1.9333005168594635, + "grad_norm": 0.822206437587738, + "learning_rate": 3.852011182647375e-06, + "loss": 0.6578, + "step": 5237 + }, + { + "epoch": 1.9336697021904996, + "grad_norm": 0.8371361494064331, + "learning_rate": 3.8516020436389945e-06, + "loss": 0.6371, + "step": 5238 + }, + { + "epoch": 1.934038887521536, + "grad_norm": 0.841515839099884, + "learning_rate": 3.851192853472903e-06, + "loss": 0.6107, + "step": 5239 + }, + { + "epoch": 1.9344080728525719, + "grad_norm": 0.9052674770355225, + "learning_rate": 3.850783612164588e-06, + "loss": 0.6829, + "step": 5240 + }, + { + "epoch": 1.9347772581836082, + "grad_norm": 0.8602111339569092, + "learning_rate": 3.8503743197295394e-06, + "loss": 0.6282, + "step": 5241 + }, + { + "epoch": 1.9351464435146444, + "grad_norm": 0.8421509861946106, + "learning_rate": 3.84996497618325e-06, + "loss": 0.6036, + "step": 5242 + }, + { + "epoch": 1.9355156288456805, + "grad_norm": 0.8711368441581726, + "learning_rate": 3.849555581541211e-06, + "loss": 0.6608, + "step": 5243 + }, + { + "epoch": 1.9358848141767169, + "grad_norm": 0.8424683213233948, + "learning_rate": 3.84914613581892e-06, + "loss": 0.6371, + "step": 5244 + }, + { + "epoch": 1.9362539995077528, + "grad_norm": 0.8541373610496521, + "learning_rate": 3.8487366390318736e-06, + "loss": 0.6604, + "step": 5245 + }, + { + "epoch": 1.9366231848387891, + "grad_norm": 0.8974496722221375, + "learning_rate": 3.848327091195571e-06, + "loss": 0.6558, + "step": 5246 + }, + { + "epoch": 1.9369923701698253, + "grad_norm": 0.871644139289856, + "learning_rate": 3.8479174923255135e-06, + "loss": 0.659, + "step": 5247 + }, + { + "epoch": 1.9373615555008614, + "grad_norm": 0.8556952476501465, + "learning_rate": 3.847507842437205e-06, + "loss": 0.6262, + "step": 5248 + }, + { + "epoch": 1.9377307408318976, + "grad_norm": 0.8600456714630127, + "learning_rate": 3.847098141546148e-06, + "loss": 0.6658, + "step": 5249 + }, + { + "epoch": 1.9380999261629337, + "grad_norm": 0.8791157603263855, + "learning_rate": 3.846688389667853e-06, + "loss": 0.6119, + "step": 5250 + }, + { + "epoch": 1.93846911149397, + "grad_norm": 0.8606899380683899, + "learning_rate": 3.846278586817827e-06, + "loss": 0.6451, + "step": 5251 + }, + { + "epoch": 1.9388382968250062, + "grad_norm": 0.8730818033218384, + "learning_rate": 3.845868733011582e-06, + "loss": 0.6435, + "step": 5252 + }, + { + "epoch": 1.9392074821560423, + "grad_norm": 0.8692975640296936, + "learning_rate": 3.84545882826463e-06, + "loss": 0.6553, + "step": 5253 + }, + { + "epoch": 1.9395766674870785, + "grad_norm": 0.8720088005065918, + "learning_rate": 3.845048872592485e-06, + "loss": 0.6595, + "step": 5254 + }, + { + "epoch": 1.9399458528181146, + "grad_norm": 0.8729556202888489, + "learning_rate": 3.844638866010666e-06, + "loss": 0.6676, + "step": 5255 + }, + { + "epoch": 1.940315038149151, + "grad_norm": 0.8830893635749817, + "learning_rate": 3.844228808534689e-06, + "loss": 0.6342, + "step": 5256 + }, + { + "epoch": 1.940684223480187, + "grad_norm": 0.8439931869506836, + "learning_rate": 3.843818700180077e-06, + "loss": 0.6383, + "step": 5257 + }, + { + "epoch": 1.9410534088112232, + "grad_norm": 0.8794415593147278, + "learning_rate": 3.843408540962351e-06, + "loss": 0.6401, + "step": 5258 + }, + { + "epoch": 1.9414225941422594, + "grad_norm": 0.8436523079872131, + "learning_rate": 3.842998330897035e-06, + "loss": 0.6258, + "step": 5259 + }, + { + "epoch": 1.9417917794732955, + "grad_norm": 0.8229694962501526, + "learning_rate": 3.842588069999657e-06, + "loss": 0.6115, + "step": 5260 + }, + { + "epoch": 1.9421609648043319, + "grad_norm": 0.8518651723861694, + "learning_rate": 3.842177758285744e-06, + "loss": 0.6415, + "step": 5261 + }, + { + "epoch": 1.942530150135368, + "grad_norm": 0.8777192831039429, + "learning_rate": 3.841767395770827e-06, + "loss": 0.6236, + "step": 5262 + }, + { + "epoch": 1.9428993354664041, + "grad_norm": 0.873447835445404, + "learning_rate": 3.841356982470436e-06, + "loss": 0.5969, + "step": 5263 + }, + { + "epoch": 1.9432685207974403, + "grad_norm": 0.8229160308837891, + "learning_rate": 3.840946518400107e-06, + "loss": 0.654, + "step": 5264 + }, + { + "epoch": 1.9436377061284764, + "grad_norm": 0.8471333384513855, + "learning_rate": 3.840536003575377e-06, + "loss": 0.6607, + "step": 5265 + }, + { + "epoch": 1.9440068914595128, + "grad_norm": 0.9126004576683044, + "learning_rate": 3.840125438011781e-06, + "loss": 0.6424, + "step": 5266 + }, + { + "epoch": 1.9443760767905487, + "grad_norm": 0.8914621472358704, + "learning_rate": 3.8397148217248604e-06, + "loss": 0.6529, + "step": 5267 + }, + { + "epoch": 1.944745262121585, + "grad_norm": 0.8534685373306274, + "learning_rate": 3.839304154730157e-06, + "loss": 0.645, + "step": 5268 + }, + { + "epoch": 1.9451144474526212, + "grad_norm": 0.8791607022285461, + "learning_rate": 3.838893437043213e-06, + "loss": 0.663, + "step": 5269 + }, + { + "epoch": 1.9454836327836573, + "grad_norm": 0.8650117516517639, + "learning_rate": 3.838482668679576e-06, + "loss": 0.6618, + "step": 5270 + }, + { + "epoch": 1.9458528181146937, + "grad_norm": 0.863811194896698, + "learning_rate": 3.838071849654792e-06, + "loss": 0.6477, + "step": 5271 + }, + { + "epoch": 1.9462220034457296, + "grad_norm": 0.8558376431465149, + "learning_rate": 3.837660979984411e-06, + "loss": 0.6341, + "step": 5272 + }, + { + "epoch": 1.946591188776766, + "grad_norm": 0.886162281036377, + "learning_rate": 3.837250059683983e-06, + "loss": 0.6841, + "step": 5273 + }, + { + "epoch": 1.946960374107802, + "grad_norm": 0.8633119463920593, + "learning_rate": 3.836839088769064e-06, + "loss": 0.6499, + "step": 5274 + }, + { + "epoch": 1.9473295594388382, + "grad_norm": 0.8590272665023804, + "learning_rate": 3.8364280672552065e-06, + "loss": 0.6436, + "step": 5275 + }, + { + "epoch": 1.9476987447698746, + "grad_norm": 0.8651774525642395, + "learning_rate": 3.836016995157968e-06, + "loss": 0.6359, + "step": 5276 + }, + { + "epoch": 1.9480679301009105, + "grad_norm": 0.8440167903900146, + "learning_rate": 3.835605872492909e-06, + "loss": 0.5916, + "step": 5277 + }, + { + "epoch": 1.9484371154319469, + "grad_norm": 0.8651075959205627, + "learning_rate": 3.835194699275589e-06, + "loss": 0.6466, + "step": 5278 + }, + { + "epoch": 1.948806300762983, + "grad_norm": 0.8562139868736267, + "learning_rate": 3.83478347552157e-06, + "loss": 0.6296, + "step": 5279 + }, + { + "epoch": 1.9491754860940191, + "grad_norm": 0.8719099760055542, + "learning_rate": 3.834372201246419e-06, + "loss": 0.6458, + "step": 5280 + }, + { + "epoch": 1.9495446714250555, + "grad_norm": 0.8383933305740356, + "learning_rate": 3.8339608764657e-06, + "loss": 0.6368, + "step": 5281 + }, + { + "epoch": 1.9499138567560914, + "grad_norm": 0.8877465724945068, + "learning_rate": 3.833549501194984e-06, + "loss": 0.656, + "step": 5282 + }, + { + "epoch": 1.9502830420871278, + "grad_norm": 0.8623272776603699, + "learning_rate": 3.83313807544984e-06, + "loss": 0.6268, + "step": 5283 + }, + { + "epoch": 1.950652227418164, + "grad_norm": 0.8594743609428406, + "learning_rate": 3.8327265992458405e-06, + "loss": 0.6285, + "step": 5284 + }, + { + "epoch": 1.9510214127492, + "grad_norm": 0.8438836932182312, + "learning_rate": 3.832315072598559e-06, + "loss": 0.6324, + "step": 5285 + }, + { + "epoch": 1.9513905980802364, + "grad_norm": 0.8539032936096191, + "learning_rate": 3.831903495523574e-06, + "loss": 0.6272, + "step": 5286 + }, + { + "epoch": 1.9517597834112723, + "grad_norm": 0.8242320418357849, + "learning_rate": 3.831491868036461e-06, + "loss": 0.6464, + "step": 5287 + }, + { + "epoch": 1.9521289687423087, + "grad_norm": 0.8372040390968323, + "learning_rate": 3.831080190152801e-06, + "loss": 0.6809, + "step": 5288 + }, + { + "epoch": 1.9524981540733448, + "grad_norm": 0.870080828666687, + "learning_rate": 3.830668461888176e-06, + "loss": 0.662, + "step": 5289 + }, + { + "epoch": 1.952867339404381, + "grad_norm": 0.8764650225639343, + "learning_rate": 3.830256683258171e-06, + "loss": 0.6646, + "step": 5290 + }, + { + "epoch": 1.9532365247354173, + "grad_norm": 0.8729700446128845, + "learning_rate": 3.8298448542783685e-06, + "loss": 0.6553, + "step": 5291 + }, + { + "epoch": 1.9536057100664532, + "grad_norm": 0.8826708793640137, + "learning_rate": 3.8294329749643585e-06, + "loss": 0.6607, + "step": 5292 + }, + { + "epoch": 1.9539748953974896, + "grad_norm": 0.8533758521080017, + "learning_rate": 3.82902104533173e-06, + "loss": 0.6293, + "step": 5293 + }, + { + "epoch": 1.9543440807285257, + "grad_norm": 0.8570526242256165, + "learning_rate": 3.828609065396075e-06, + "loss": 0.6785, + "step": 5294 + }, + { + "epoch": 1.9547132660595619, + "grad_norm": 0.858711302280426, + "learning_rate": 3.828197035172986e-06, + "loss": 0.6174, + "step": 5295 + }, + { + "epoch": 1.9550824513905982, + "grad_norm": 0.8520277142524719, + "learning_rate": 3.827784954678059e-06, + "loss": 0.6844, + "step": 5296 + }, + { + "epoch": 1.9554516367216341, + "grad_norm": 0.850110650062561, + "learning_rate": 3.827372823926888e-06, + "loss": 0.6421, + "step": 5297 + }, + { + "epoch": 1.9558208220526705, + "grad_norm": 0.8556386232376099, + "learning_rate": 3.826960642935077e-06, + "loss": 0.6194, + "step": 5298 + }, + { + "epoch": 1.9561900073837066, + "grad_norm": 0.9002448320388794, + "learning_rate": 3.826548411718224e-06, + "loss": 0.6362, + "step": 5299 + }, + { + "epoch": 1.9565591927147428, + "grad_norm": 0.8629332184791565, + "learning_rate": 3.826136130291932e-06, + "loss": 0.6502, + "step": 5300 + }, + { + "epoch": 1.9569283780457791, + "grad_norm": 0.8716061115264893, + "learning_rate": 3.825723798671805e-06, + "loss": 0.6627, + "step": 5301 + }, + { + "epoch": 1.957297563376815, + "grad_norm": 0.8813290596008301, + "learning_rate": 3.825311416873451e-06, + "loss": 0.5863, + "step": 5302 + }, + { + "epoch": 1.9576667487078514, + "grad_norm": 0.8662620186805725, + "learning_rate": 3.82489898491248e-06, + "loss": 0.6301, + "step": 5303 + }, + { + "epoch": 1.9580359340388875, + "grad_norm": 0.8735976219177246, + "learning_rate": 3.8244865028045e-06, + "loss": 0.6628, + "step": 5304 + }, + { + "epoch": 1.9584051193699237, + "grad_norm": 0.8475589156150818, + "learning_rate": 3.824073970565122e-06, + "loss": 0.6612, + "step": 5305 + }, + { + "epoch": 1.95877430470096, + "grad_norm": 0.8694972991943359, + "learning_rate": 3.823661388209964e-06, + "loss": 0.6021, + "step": 5306 + }, + { + "epoch": 1.959143490031996, + "grad_norm": 0.8800960779190063, + "learning_rate": 3.823248755754639e-06, + "loss": 0.6615, + "step": 5307 + }, + { + "epoch": 1.9595126753630323, + "grad_norm": 0.8860705494880676, + "learning_rate": 3.822836073214768e-06, + "loss": 0.6393, + "step": 5308 + }, + { + "epoch": 1.9598818606940684, + "grad_norm": 0.8552931547164917, + "learning_rate": 3.822423340605966e-06, + "loss": 0.6342, + "step": 5309 + }, + { + "epoch": 1.9602510460251046, + "grad_norm": 0.860863447189331, + "learning_rate": 3.822010557943861e-06, + "loss": 0.63, + "step": 5310 + }, + { + "epoch": 1.960620231356141, + "grad_norm": 0.8707046508789062, + "learning_rate": 3.821597725244072e-06, + "loss": 0.6719, + "step": 5311 + }, + { + "epoch": 1.9609894166871769, + "grad_norm": 0.8764771223068237, + "learning_rate": 3.821184842522226e-06, + "loss": 0.6241, + "step": 5312 + }, + { + "epoch": 1.9613586020182132, + "grad_norm": 0.8700007200241089, + "learning_rate": 3.8207719097939515e-06, + "loss": 0.6697, + "step": 5313 + }, + { + "epoch": 1.9617277873492494, + "grad_norm": 0.8548839092254639, + "learning_rate": 3.820358927074877e-06, + "loss": 0.6432, + "step": 5314 + }, + { + "epoch": 1.9620969726802855, + "grad_norm": 0.8492469787597656, + "learning_rate": 3.819945894380633e-06, + "loss": 0.585, + "step": 5315 + }, + { + "epoch": 1.9624661580113218, + "grad_norm": 0.8464144468307495, + "learning_rate": 3.819532811726855e-06, + "loss": 0.6415, + "step": 5316 + }, + { + "epoch": 1.9628353433423578, + "grad_norm": 0.8583611249923706, + "learning_rate": 3.819119679129175e-06, + "loss": 0.6421, + "step": 5317 + }, + { + "epoch": 1.9632045286733941, + "grad_norm": 0.9368652105331421, + "learning_rate": 3.8187064966032325e-06, + "loss": 0.6622, + "step": 5318 + }, + { + "epoch": 1.9635737140044303, + "grad_norm": 0.869702935218811, + "learning_rate": 3.818293264164665e-06, + "loss": 0.6442, + "step": 5319 + }, + { + "epoch": 1.9639428993354664, + "grad_norm": 0.8909294009208679, + "learning_rate": 3.8178799818291136e-06, + "loss": 0.6248, + "step": 5320 + }, + { + "epoch": 1.9643120846665025, + "grad_norm": 0.853177011013031, + "learning_rate": 3.81746664961222e-06, + "loss": 0.5993, + "step": 5321 + }, + { + "epoch": 1.9646812699975387, + "grad_norm": 0.8236253261566162, + "learning_rate": 3.8170532675296305e-06, + "loss": 0.6752, + "step": 5322 + }, + { + "epoch": 1.965050455328575, + "grad_norm": 0.8880385756492615, + "learning_rate": 3.81663983559699e-06, + "loss": 0.6921, + "step": 5323 + }, + { + "epoch": 1.9654196406596112, + "grad_norm": 0.8312002420425415, + "learning_rate": 3.8162263538299485e-06, + "loss": 0.6587, + "step": 5324 + }, + { + "epoch": 1.9657888259906473, + "grad_norm": 0.8925222754478455, + "learning_rate": 3.815812822244152e-06, + "loss": 0.6321, + "step": 5325 + }, + { + "epoch": 1.9661580113216834, + "grad_norm": 0.8339921832084656, + "learning_rate": 3.815399240855258e-06, + "loss": 0.6247, + "step": 5326 + }, + { + "epoch": 1.9665271966527196, + "grad_norm": 0.8682547807693481, + "learning_rate": 3.814985609678917e-06, + "loss": 0.6823, + "step": 5327 + }, + { + "epoch": 1.966896381983756, + "grad_norm": 0.861721396446228, + "learning_rate": 3.8145719287307862e-06, + "loss": 0.6384, + "step": 5328 + }, + { + "epoch": 1.967265567314792, + "grad_norm": 0.8265729546546936, + "learning_rate": 3.8141581980265223e-06, + "loss": 0.6426, + "step": 5329 + }, + { + "epoch": 1.9676347526458282, + "grad_norm": 0.8758171796798706, + "learning_rate": 3.813744417581786e-06, + "loss": 0.6353, + "step": 5330 + }, + { + "epoch": 1.9680039379768643, + "grad_norm": 0.8549520373344421, + "learning_rate": 3.8133305874122375e-06, + "loss": 0.6486, + "step": 5331 + }, + { + "epoch": 1.9683731233079005, + "grad_norm": 0.8729177117347717, + "learning_rate": 3.8129167075335406e-06, + "loss": 0.6362, + "step": 5332 + }, + { + "epoch": 1.9687423086389368, + "grad_norm": 0.8729832768440247, + "learning_rate": 3.812502777961361e-06, + "loss": 0.6324, + "step": 5333 + }, + { + "epoch": 1.969111493969973, + "grad_norm": 0.8402023911476135, + "learning_rate": 3.812088798711365e-06, + "loss": 0.6523, + "step": 5334 + }, + { + "epoch": 1.9694806793010091, + "grad_norm": 0.8553396463394165, + "learning_rate": 3.8116747697992223e-06, + "loss": 0.6561, + "step": 5335 + }, + { + "epoch": 1.9698498646320453, + "grad_norm": 0.8708754777908325, + "learning_rate": 3.811260691240604e-06, + "loss": 0.669, + "step": 5336 + }, + { + "epoch": 1.9702190499630814, + "grad_norm": 0.8545799255371094, + "learning_rate": 3.810846563051182e-06, + "loss": 0.6357, + "step": 5337 + }, + { + "epoch": 1.9705882352941178, + "grad_norm": 0.8874366879463196, + "learning_rate": 3.810432385246632e-06, + "loss": 0.6974, + "step": 5338 + }, + { + "epoch": 1.9709574206251537, + "grad_norm": 0.8524917364120483, + "learning_rate": 3.810018157842629e-06, + "loss": 0.6397, + "step": 5339 + }, + { + "epoch": 1.97132660595619, + "grad_norm": 0.8522272109985352, + "learning_rate": 3.809603880854852e-06, + "loss": 0.6671, + "step": 5340 + }, + { + "epoch": 1.9716957912872262, + "grad_norm": 0.8634406924247742, + "learning_rate": 3.809189554298982e-06, + "loss": 0.6535, + "step": 5341 + }, + { + "epoch": 1.9720649766182623, + "grad_norm": 0.83188396692276, + "learning_rate": 3.808775178190701e-06, + "loss": 0.6136, + "step": 5342 + }, + { + "epoch": 1.9724341619492987, + "grad_norm": 0.8376840353012085, + "learning_rate": 3.808360752545691e-06, + "loss": 0.6254, + "step": 5343 + }, + { + "epoch": 1.9728033472803346, + "grad_norm": 0.8615738749504089, + "learning_rate": 3.807946277379641e-06, + "loss": 0.6267, + "step": 5344 + }, + { + "epoch": 1.973172532611371, + "grad_norm": 0.8408340215682983, + "learning_rate": 3.8075317527082355e-06, + "loss": 0.6789, + "step": 5345 + }, + { + "epoch": 1.973541717942407, + "grad_norm": 0.8540735244750977, + "learning_rate": 3.807117178547168e-06, + "loss": 0.6457, + "step": 5346 + }, + { + "epoch": 1.9739109032734432, + "grad_norm": 0.8572787046432495, + "learning_rate": 3.8067025549121262e-06, + "loss": 0.6751, + "step": 5347 + }, + { + "epoch": 1.9742800886044796, + "grad_norm": 0.8764067888259888, + "learning_rate": 3.806287881818806e-06, + "loss": 0.6738, + "step": 5348 + }, + { + "epoch": 1.9746492739355155, + "grad_norm": 0.8872396349906921, + "learning_rate": 3.8058731592829014e-06, + "loss": 0.626, + "step": 5349 + }, + { + "epoch": 1.9750184592665518, + "grad_norm": 0.8494341373443604, + "learning_rate": 3.8054583873201096e-06, + "loss": 0.6486, + "step": 5350 + }, + { + "epoch": 1.975387644597588, + "grad_norm": 0.899255633354187, + "learning_rate": 3.80504356594613e-06, + "loss": 0.6501, + "step": 5351 + }, + { + "epoch": 1.9757568299286241, + "grad_norm": 0.8545229434967041, + "learning_rate": 3.804628695176663e-06, + "loss": 0.6424, + "step": 5352 + }, + { + "epoch": 1.9761260152596605, + "grad_norm": 0.8477988839149475, + "learning_rate": 3.804213775027412e-06, + "loss": 0.6586, + "step": 5353 + }, + { + "epoch": 1.9764952005906964, + "grad_norm": 0.8850751519203186, + "learning_rate": 3.8037988055140818e-06, + "loss": 0.6905, + "step": 5354 + }, + { + "epoch": 1.9768643859217327, + "grad_norm": 0.8931417465209961, + "learning_rate": 3.803383786652377e-06, + "loss": 0.6326, + "step": 5355 + }, + { + "epoch": 1.9772335712527689, + "grad_norm": 0.8453183770179749, + "learning_rate": 3.802968718458009e-06, + "loss": 0.6456, + "step": 5356 + }, + { + "epoch": 1.977602756583805, + "grad_norm": 0.8661874532699585, + "learning_rate": 3.8025536009466847e-06, + "loss": 0.6268, + "step": 5357 + }, + { + "epoch": 1.9779719419148414, + "grad_norm": 0.8806263208389282, + "learning_rate": 3.802138434134118e-06, + "loss": 0.6495, + "step": 5358 + }, + { + "epoch": 1.9783411272458773, + "grad_norm": 0.8934404253959656, + "learning_rate": 3.8017232180360223e-06, + "loss": 0.6344, + "step": 5359 + }, + { + "epoch": 1.9787103125769137, + "grad_norm": 0.8480618000030518, + "learning_rate": 3.8013079526681146e-06, + "loss": 0.6236, + "step": 5360 + }, + { + "epoch": 1.9790794979079498, + "grad_norm": 0.8359216451644897, + "learning_rate": 3.800892638046111e-06, + "loss": 0.6625, + "step": 5361 + }, + { + "epoch": 1.979448683238986, + "grad_norm": 0.834880530834198, + "learning_rate": 3.800477274185732e-06, + "loss": 0.6177, + "step": 5362 + }, + { + "epoch": 1.9798178685700223, + "grad_norm": 0.8574850559234619, + "learning_rate": 3.800061861102698e-06, + "loss": 0.6009, + "step": 5363 + }, + { + "epoch": 1.9801870539010582, + "grad_norm": 0.8260051012039185, + "learning_rate": 3.799646398812733e-06, + "loss": 0.6669, + "step": 5364 + }, + { + "epoch": 1.9805562392320946, + "grad_norm": 0.8773712515830994, + "learning_rate": 3.799230887331562e-06, + "loss": 0.6878, + "step": 5365 + }, + { + "epoch": 1.9809254245631307, + "grad_norm": 0.8762547969818115, + "learning_rate": 3.7988153266749123e-06, + "loss": 0.6383, + "step": 5366 + }, + { + "epoch": 1.9812946098941668, + "grad_norm": 0.885169506072998, + "learning_rate": 3.7983997168585117e-06, + "loss": 0.6344, + "step": 5367 + }, + { + "epoch": 1.9816637952252032, + "grad_norm": 0.8332593441009521, + "learning_rate": 3.7979840578980925e-06, + "loss": 0.6542, + "step": 5368 + }, + { + "epoch": 1.982032980556239, + "grad_norm": 0.877865731716156, + "learning_rate": 3.797568349809385e-06, + "loss": 0.6547, + "step": 5369 + }, + { + "epoch": 1.9824021658872755, + "grad_norm": 0.8642792701721191, + "learning_rate": 3.7971525926081267e-06, + "loss": 0.619, + "step": 5370 + }, + { + "epoch": 1.9827713512183116, + "grad_norm": 0.8647819757461548, + "learning_rate": 3.796736786310051e-06, + "loss": 0.6591, + "step": 5371 + }, + { + "epoch": 1.9831405365493477, + "grad_norm": 0.8636802434921265, + "learning_rate": 3.796320930930898e-06, + "loss": 0.6319, + "step": 5372 + }, + { + "epoch": 1.983509721880384, + "grad_norm": 0.8558893799781799, + "learning_rate": 3.7959050264864063e-06, + "loss": 0.6231, + "step": 5373 + }, + { + "epoch": 1.98387890721142, + "grad_norm": 0.8667943477630615, + "learning_rate": 3.7954890729923182e-06, + "loss": 0.6197, + "step": 5374 + }, + { + "epoch": 1.9842480925424564, + "grad_norm": 0.8311333060264587, + "learning_rate": 3.795073070464378e-06, + "loss": 0.6441, + "step": 5375 + }, + { + "epoch": 1.9846172778734925, + "grad_norm": 0.8610514998435974, + "learning_rate": 3.7946570189183307e-06, + "loss": 0.6089, + "step": 5376 + }, + { + "epoch": 1.9849864632045287, + "grad_norm": 0.8558089733123779, + "learning_rate": 3.7942409183699234e-06, + "loss": 0.6348, + "step": 5377 + }, + { + "epoch": 1.985355648535565, + "grad_norm": 0.8550251126289368, + "learning_rate": 3.793824768834906e-06, + "loss": 0.6355, + "step": 5378 + }, + { + "epoch": 1.985724833866601, + "grad_norm": 0.8760383725166321, + "learning_rate": 3.7934085703290303e-06, + "loss": 0.6546, + "step": 5379 + }, + { + "epoch": 1.9860940191976373, + "grad_norm": 0.8797130584716797, + "learning_rate": 3.7929923228680477e-06, + "loss": 0.6253, + "step": 5380 + }, + { + "epoch": 1.9864632045286734, + "grad_norm": 0.8351935744285583, + "learning_rate": 3.7925760264677138e-06, + "loss": 0.6078, + "step": 5381 + }, + { + "epoch": 1.9868323898597096, + "grad_norm": 0.8384756445884705, + "learning_rate": 3.7921596811437856e-06, + "loss": 0.604, + "step": 5382 + }, + { + "epoch": 1.987201575190746, + "grad_norm": 0.862532377243042, + "learning_rate": 3.7917432869120196e-06, + "loss": 0.6375, + "step": 5383 + }, + { + "epoch": 1.9875707605217818, + "grad_norm": 0.8722258806228638, + "learning_rate": 3.79132684378818e-06, + "loss": 0.6696, + "step": 5384 + }, + { + "epoch": 1.9879399458528182, + "grad_norm": 0.8942250609397888, + "learning_rate": 3.7909103517880256e-06, + "loss": 0.6485, + "step": 5385 + }, + { + "epoch": 1.9883091311838543, + "grad_norm": 0.849071204662323, + "learning_rate": 3.7904938109273225e-06, + "loss": 0.6305, + "step": 5386 + }, + { + "epoch": 1.9886783165148905, + "grad_norm": 0.8711240887641907, + "learning_rate": 3.7900772212218356e-06, + "loss": 0.6509, + "step": 5387 + }, + { + "epoch": 1.9890475018459266, + "grad_norm": 0.8682756423950195, + "learning_rate": 3.789660582687333e-06, + "loss": 0.6356, + "step": 5388 + }, + { + "epoch": 1.9894166871769627, + "grad_norm": 0.8574374914169312, + "learning_rate": 3.789243895339585e-06, + "loss": 0.6636, + "step": 5389 + }, + { + "epoch": 1.989785872507999, + "grad_norm": 0.8733952641487122, + "learning_rate": 3.788827159194362e-06, + "loss": 0.6759, + "step": 5390 + }, + { + "epoch": 1.9901550578390352, + "grad_norm": 0.8609288930892944, + "learning_rate": 3.7884103742674377e-06, + "loss": 0.6639, + "step": 5391 + }, + { + "epoch": 1.9905242431700714, + "grad_norm": 0.8640407919883728, + "learning_rate": 3.787993540574587e-06, + "loss": 0.6785, + "step": 5392 + }, + { + "epoch": 1.9908934285011075, + "grad_norm": 0.8812341094017029, + "learning_rate": 3.7875766581315876e-06, + "loss": 0.6321, + "step": 5393 + }, + { + "epoch": 1.9912626138321436, + "grad_norm": 0.8695360422134399, + "learning_rate": 3.7871597269542188e-06, + "loss": 0.6525, + "step": 5394 + }, + { + "epoch": 1.99163179916318, + "grad_norm": 0.863109827041626, + "learning_rate": 3.7867427470582596e-06, + "loss": 0.6451, + "step": 5395 + }, + { + "epoch": 1.9920009844942161, + "grad_norm": 0.858963131904602, + "learning_rate": 3.786325718459495e-06, + "loss": 0.6409, + "step": 5396 + }, + { + "epoch": 1.9923701698252523, + "grad_norm": 0.8365902900695801, + "learning_rate": 3.7859086411737056e-06, + "loss": 0.6177, + "step": 5397 + }, + { + "epoch": 1.9927393551562884, + "grad_norm": 0.8309932947158813, + "learning_rate": 3.7854915152166814e-06, + "loss": 0.6442, + "step": 5398 + }, + { + "epoch": 1.9931085404873246, + "grad_norm": 0.8945224285125732, + "learning_rate": 3.785074340604209e-06, + "loss": 0.6517, + "step": 5399 + }, + { + "epoch": 1.993477725818361, + "grad_norm": 0.8749533295631409, + "learning_rate": 3.7846571173520787e-06, + "loss": 0.6364, + "step": 5400 + }, + { + "epoch": 1.993846911149397, + "grad_norm": 0.8746995329856873, + "learning_rate": 3.784239845476081e-06, + "loss": 0.6507, + "step": 5401 + }, + { + "epoch": 1.9942160964804332, + "grad_norm": 0.8650192022323608, + "learning_rate": 3.7838225249920107e-06, + "loss": 0.6338, + "step": 5402 + }, + { + "epoch": 1.9945852818114693, + "grad_norm": 0.8604942560195923, + "learning_rate": 3.783405155915663e-06, + "loss": 0.67, + "step": 5403 + }, + { + "epoch": 1.9949544671425055, + "grad_norm": 0.8618472814559937, + "learning_rate": 3.7829877382628363e-06, + "loss": 0.6235, + "step": 5404 + }, + { + "epoch": 1.9953236524735418, + "grad_norm": 0.8411215543746948, + "learning_rate": 3.7825702720493275e-06, + "loss": 0.6313, + "step": 5405 + }, + { + "epoch": 1.9956928378045777, + "grad_norm": 0.8370277881622314, + "learning_rate": 3.782152757290939e-06, + "loss": 0.6484, + "step": 5406 + }, + { + "epoch": 1.996062023135614, + "grad_norm": 0.8832712173461914, + "learning_rate": 3.781735194003473e-06, + "loss": 0.6728, + "step": 5407 + }, + { + "epoch": 1.9964312084666502, + "grad_norm": 0.8636376857757568, + "learning_rate": 3.781317582202735e-06, + "loss": 0.6143, + "step": 5408 + }, + { + "epoch": 1.9968003937976864, + "grad_norm": 0.8576546907424927, + "learning_rate": 3.78089992190453e-06, + "loss": 0.6268, + "step": 5409 + }, + { + "epoch": 1.9971695791287227, + "grad_norm": 0.8420486450195312, + "learning_rate": 3.7804822131246682e-06, + "loss": 0.623, + "step": 5410 + }, + { + "epoch": 1.9975387644597586, + "grad_norm": 0.8502286672592163, + "learning_rate": 3.780064455878958e-06, + "loss": 0.6539, + "step": 5411 + }, + { + "epoch": 1.997907949790795, + "grad_norm": 0.8586933612823486, + "learning_rate": 3.7796466501832126e-06, + "loss": 0.6593, + "step": 5412 + }, + { + "epoch": 1.9982771351218311, + "grad_norm": 0.8478260636329651, + "learning_rate": 3.779228796053245e-06, + "loss": 0.6321, + "step": 5413 + }, + { + "epoch": 1.9986463204528673, + "grad_norm": 0.8597049713134766, + "learning_rate": 3.7788108935048716e-06, + "loss": 0.595, + "step": 5414 + }, + { + "epoch": 1.9990155057839036, + "grad_norm": 0.8475335240364075, + "learning_rate": 3.778392942553909e-06, + "loss": 0.6932, + "step": 5415 + }, + { + "epoch": 1.9993846911149395, + "grad_norm": 0.8577222228050232, + "learning_rate": 3.777974943216177e-06, + "loss": 0.6629, + "step": 5416 + }, + { + "epoch": 1.999753876445976, + "grad_norm": 0.8618422746658325, + "learning_rate": 3.777556895507497e-06, + "loss": 0.6523, + "step": 5417 + }, + { + "epoch": 2.0, + "grad_norm": 1.0387004613876343, + "learning_rate": 3.7771387994436924e-06, + "loss": 0.6618, + "step": 5418 + }, + { + "epoch": 2.0003691853310364, + "grad_norm": 1.0295758247375488, + "learning_rate": 3.7767206550405867e-06, + "loss": 0.6048, + "step": 5419 + }, + { + "epoch": 2.0007383706620723, + "grad_norm": 0.8333767056465149, + "learning_rate": 3.776302462314008e-06, + "loss": 0.5869, + "step": 5420 + }, + { + "epoch": 2.0011075559931086, + "grad_norm": 0.8305107355117798, + "learning_rate": 3.775884221279782e-06, + "loss": 0.618, + "step": 5421 + }, + { + "epoch": 2.0014767413241445, + "grad_norm": 0.8287608623504639, + "learning_rate": 3.7754659319537425e-06, + "loss": 0.6207, + "step": 5422 + }, + { + "epoch": 2.001845926655181, + "grad_norm": 0.831290602684021, + "learning_rate": 3.7750475943517185e-06, + "loss": 0.606, + "step": 5423 + }, + { + "epoch": 2.0022151119862173, + "grad_norm": 0.8303481340408325, + "learning_rate": 3.774629208489547e-06, + "loss": 0.6397, + "step": 5424 + }, + { + "epoch": 2.002584297317253, + "grad_norm": 0.8491837382316589, + "learning_rate": 3.7742107743830615e-06, + "loss": 0.5793, + "step": 5425 + }, + { + "epoch": 2.0029534826482895, + "grad_norm": 0.8744213581085205, + "learning_rate": 3.7737922920481002e-06, + "loss": 0.6415, + "step": 5426 + }, + { + "epoch": 2.0033226679793255, + "grad_norm": 0.8968637585639954, + "learning_rate": 3.773373761500503e-06, + "loss": 0.6209, + "step": 5427 + }, + { + "epoch": 2.003691853310362, + "grad_norm": 0.8872970938682556, + "learning_rate": 3.7729551827561107e-06, + "loss": 0.6413, + "step": 5428 + }, + { + "epoch": 2.004061038641398, + "grad_norm": 0.8702279329299927, + "learning_rate": 3.772536555830767e-06, + "loss": 0.6286, + "step": 5429 + }, + { + "epoch": 2.004430223972434, + "grad_norm": 0.8310436606407166, + "learning_rate": 3.772117880740315e-06, + "loss": 0.5949, + "step": 5430 + }, + { + "epoch": 2.0047994093034704, + "grad_norm": 0.8665198683738708, + "learning_rate": 3.7716991575006033e-06, + "loss": 0.6464, + "step": 5431 + }, + { + "epoch": 2.0051685946345064, + "grad_norm": 0.8608758449554443, + "learning_rate": 3.7712803861274806e-06, + "loss": 0.5851, + "step": 5432 + }, + { + "epoch": 2.0055377799655427, + "grad_norm": 0.8466663360595703, + "learning_rate": 3.770861566636796e-06, + "loss": 0.6442, + "step": 5433 + }, + { + "epoch": 2.005906965296579, + "grad_norm": 0.8837466835975647, + "learning_rate": 3.7704426990444022e-06, + "loss": 0.6157, + "step": 5434 + }, + { + "epoch": 2.006276150627615, + "grad_norm": 0.8762925267219543, + "learning_rate": 3.7700237833661523e-06, + "loss": 0.6372, + "step": 5435 + }, + { + "epoch": 2.0066453359586514, + "grad_norm": 0.8840300440788269, + "learning_rate": 3.7696048196179047e-06, + "loss": 0.5827, + "step": 5436 + }, + { + "epoch": 2.0070145212896873, + "grad_norm": 1.024178385734558, + "learning_rate": 3.769185807815514e-06, + "loss": 0.627, + "step": 5437 + }, + { + "epoch": 2.0073837066207236, + "grad_norm": 0.850252091884613, + "learning_rate": 3.7687667479748426e-06, + "loss": 0.6127, + "step": 5438 + }, + { + "epoch": 2.00775289195176, + "grad_norm": 0.8254269361495972, + "learning_rate": 3.7683476401117486e-06, + "loss": 0.5879, + "step": 5439 + }, + { + "epoch": 2.008122077282796, + "grad_norm": 0.8563636541366577, + "learning_rate": 3.767928484242097e-06, + "loss": 0.6087, + "step": 5440 + }, + { + "epoch": 2.0084912626138323, + "grad_norm": 0.8765075206756592, + "learning_rate": 3.7675092803817524e-06, + "loss": 0.5945, + "step": 5441 + }, + { + "epoch": 2.008860447944868, + "grad_norm": 0.8346552848815918, + "learning_rate": 3.767090028546583e-06, + "loss": 0.641, + "step": 5442 + }, + { + "epoch": 2.0092296332759045, + "grad_norm": 0.8615577220916748, + "learning_rate": 3.7666707287524545e-06, + "loss": 0.5714, + "step": 5443 + }, + { + "epoch": 2.009598818606941, + "grad_norm": 0.8758264183998108, + "learning_rate": 3.766251381015239e-06, + "loss": 0.6378, + "step": 5444 + }, + { + "epoch": 2.009968003937977, + "grad_norm": 0.885068416595459, + "learning_rate": 3.765831985350808e-06, + "loss": 0.632, + "step": 5445 + }, + { + "epoch": 2.010337189269013, + "grad_norm": 0.8851007223129272, + "learning_rate": 3.7654125417750365e-06, + "loss": 0.6281, + "step": 5446 + }, + { + "epoch": 2.010706374600049, + "grad_norm": 0.8572611212730408, + "learning_rate": 3.764993050303799e-06, + "loss": 0.6115, + "step": 5447 + }, + { + "epoch": 2.0110755599310854, + "grad_norm": 0.871554970741272, + "learning_rate": 3.7645735109529745e-06, + "loss": 0.6183, + "step": 5448 + }, + { + "epoch": 2.011444745262122, + "grad_norm": 0.8775212168693542, + "learning_rate": 3.7641539237384418e-06, + "loss": 0.6441, + "step": 5449 + }, + { + "epoch": 2.0118139305931577, + "grad_norm": 0.8792772889137268, + "learning_rate": 3.7637342886760813e-06, + "loss": 0.6397, + "step": 5450 + }, + { + "epoch": 2.012183115924194, + "grad_norm": 0.8603711724281311, + "learning_rate": 3.763314605781777e-06, + "loss": 0.6183, + "step": 5451 + }, + { + "epoch": 2.01255230125523, + "grad_norm": 0.8884883522987366, + "learning_rate": 3.7628948750714143e-06, + "loss": 0.6389, + "step": 5452 + }, + { + "epoch": 2.0129214865862664, + "grad_norm": 0.8615447878837585, + "learning_rate": 3.762475096560878e-06, + "loss": 0.6309, + "step": 5453 + }, + { + "epoch": 2.0132906719173023, + "grad_norm": 0.8597072958946228, + "learning_rate": 3.7620552702660582e-06, + "loss": 0.6203, + "step": 5454 + }, + { + "epoch": 2.0136598572483386, + "grad_norm": 0.8705599904060364, + "learning_rate": 3.761635396202845e-06, + "loss": 0.6244, + "step": 5455 + }, + { + "epoch": 2.014029042579375, + "grad_norm": 0.846267819404602, + "learning_rate": 3.761215474387131e-06, + "loss": 0.6106, + "step": 5456 + }, + { + "epoch": 2.014398227910411, + "grad_norm": 0.8633518218994141, + "learning_rate": 3.7607955048348076e-06, + "loss": 0.6551, + "step": 5457 + }, + { + "epoch": 2.0147674132414473, + "grad_norm": 0.8638667464256287, + "learning_rate": 3.7603754875617736e-06, + "loss": 0.6202, + "step": 5458 + }, + { + "epoch": 2.015136598572483, + "grad_norm": 0.8897606730461121, + "learning_rate": 3.7599554225839245e-06, + "loss": 0.618, + "step": 5459 + }, + { + "epoch": 2.0155057839035195, + "grad_norm": 0.9317191243171692, + "learning_rate": 3.7595353099171603e-06, + "loss": 0.6451, + "step": 5460 + }, + { + "epoch": 2.015874969234556, + "grad_norm": 0.8930243253707886, + "learning_rate": 3.7591151495773825e-06, + "loss": 0.6465, + "step": 5461 + }, + { + "epoch": 2.016244154565592, + "grad_norm": 0.8794429302215576, + "learning_rate": 3.758694941580494e-06, + "loss": 0.6354, + "step": 5462 + }, + { + "epoch": 2.016613339896628, + "grad_norm": 0.8783844113349915, + "learning_rate": 3.758274685942398e-06, + "loss": 0.626, + "step": 5463 + }, + { + "epoch": 2.016982525227664, + "grad_norm": 0.8416608572006226, + "learning_rate": 3.757854382679003e-06, + "loss": 0.6237, + "step": 5464 + }, + { + "epoch": 2.0173517105587004, + "grad_norm": 0.8936009407043457, + "learning_rate": 3.7574340318062162e-06, + "loss": 0.6554, + "step": 5465 + }, + { + "epoch": 2.017720895889737, + "grad_norm": 0.8711419105529785, + "learning_rate": 3.7570136333399495e-06, + "loss": 0.6168, + "step": 5466 + }, + { + "epoch": 2.0180900812207727, + "grad_norm": 0.8863180875778198, + "learning_rate": 3.756593187296112e-06, + "loss": 0.6438, + "step": 5467 + }, + { + "epoch": 2.018459266551809, + "grad_norm": 0.8442468047142029, + "learning_rate": 3.7561726936906195e-06, + "loss": 0.6142, + "step": 5468 + }, + { + "epoch": 2.018828451882845, + "grad_norm": 0.8627871870994568, + "learning_rate": 3.755752152539387e-06, + "loss": 0.6179, + "step": 5469 + }, + { + "epoch": 2.0191976372138813, + "grad_norm": 0.8819690942764282, + "learning_rate": 3.755331563858332e-06, + "loss": 0.6133, + "step": 5470 + }, + { + "epoch": 2.0195668225449177, + "grad_norm": 0.8856140375137329, + "learning_rate": 3.754910927663373e-06, + "loss": 0.6392, + "step": 5471 + }, + { + "epoch": 2.0199360078759536, + "grad_norm": 0.897413969039917, + "learning_rate": 3.754490243970432e-06, + "loss": 0.6206, + "step": 5472 + }, + { + "epoch": 2.02030519320699, + "grad_norm": 0.8817588686943054, + "learning_rate": 3.7540695127954306e-06, + "loss": 0.6063, + "step": 5473 + }, + { + "epoch": 2.020674378538026, + "grad_norm": 0.8708905577659607, + "learning_rate": 3.753648734154294e-06, + "loss": 0.6363, + "step": 5474 + }, + { + "epoch": 2.0210435638690623, + "grad_norm": 0.9195183515548706, + "learning_rate": 3.7532279080629493e-06, + "loss": 0.6395, + "step": 5475 + }, + { + "epoch": 2.0214127492000986, + "grad_norm": 0.8696544170379639, + "learning_rate": 3.7528070345373236e-06, + "loss": 0.6353, + "step": 5476 + }, + { + "epoch": 2.0217819345311345, + "grad_norm": 0.9236888885498047, + "learning_rate": 3.752386113593346e-06, + "loss": 0.6401, + "step": 5477 + }, + { + "epoch": 2.022151119862171, + "grad_norm": 0.8806305527687073, + "learning_rate": 3.75196514524695e-06, + "loss": 0.627, + "step": 5478 + }, + { + "epoch": 2.022520305193207, + "grad_norm": 0.9154170155525208, + "learning_rate": 3.751544129514069e-06, + "loss": 0.6473, + "step": 5479 + }, + { + "epoch": 2.022889490524243, + "grad_norm": 0.8469644784927368, + "learning_rate": 3.751123066410637e-06, + "loss": 0.5945, + "step": 5480 + }, + { + "epoch": 2.0232586758552795, + "grad_norm": 0.8675203919410706, + "learning_rate": 3.7507019559525914e-06, + "loss": 0.6529, + "step": 5481 + }, + { + "epoch": 2.0236278611863154, + "grad_norm": 0.8653020262718201, + "learning_rate": 3.7502807981558713e-06, + "loss": 0.5952, + "step": 5482 + }, + { + "epoch": 2.023997046517352, + "grad_norm": 0.9021411538124084, + "learning_rate": 3.749859593036419e-06, + "loss": 0.6204, + "step": 5483 + }, + { + "epoch": 2.0243662318483877, + "grad_norm": 0.8743168711662292, + "learning_rate": 3.7494383406101744e-06, + "loss": 0.6309, + "step": 5484 + }, + { + "epoch": 2.024735417179424, + "grad_norm": 0.8826589584350586, + "learning_rate": 3.749017040893083e-06, + "loss": 0.6485, + "step": 5485 + }, + { + "epoch": 2.0251046025104604, + "grad_norm": 0.8869401812553406, + "learning_rate": 3.7485956939010914e-06, + "loss": 0.5748, + "step": 5486 + }, + { + "epoch": 2.0254737878414963, + "grad_norm": 0.8839751482009888, + "learning_rate": 3.7481742996501465e-06, + "loss": 0.6341, + "step": 5487 + }, + { + "epoch": 2.0258429731725327, + "grad_norm": 0.9000351428985596, + "learning_rate": 3.747752858156198e-06, + "loss": 0.6239, + "step": 5488 + }, + { + "epoch": 2.0262121585035686, + "grad_norm": 0.9394230246543884, + "learning_rate": 3.747331369435198e-06, + "loss": 0.6366, + "step": 5489 + }, + { + "epoch": 2.026581343834605, + "grad_norm": 0.884376049041748, + "learning_rate": 3.7469098335030995e-06, + "loss": 0.6243, + "step": 5490 + }, + { + "epoch": 2.0269505291656413, + "grad_norm": 0.8773068785667419, + "learning_rate": 3.746488250375857e-06, + "loss": 0.6121, + "step": 5491 + }, + { + "epoch": 2.0273197144966772, + "grad_norm": 0.8872068524360657, + "learning_rate": 3.7460666200694275e-06, + "loss": 0.5996, + "step": 5492 + }, + { + "epoch": 2.0276888998277136, + "grad_norm": 0.8740941882133484, + "learning_rate": 3.7456449425997704e-06, + "loss": 0.6217, + "step": 5493 + }, + { + "epoch": 2.0280580851587495, + "grad_norm": 0.8861613869667053, + "learning_rate": 3.7452232179828445e-06, + "loss": 0.604, + "step": 5494 + }, + { + "epoch": 2.028427270489786, + "grad_norm": 0.8394767045974731, + "learning_rate": 3.7448014462346125e-06, + "loss": 0.5831, + "step": 5495 + }, + { + "epoch": 2.0287964558208222, + "grad_norm": 0.8913205862045288, + "learning_rate": 3.7443796273710402e-06, + "loss": 0.6012, + "step": 5496 + }, + { + "epoch": 2.029165641151858, + "grad_norm": 0.9268904328346252, + "learning_rate": 3.7439577614080908e-06, + "loss": 0.5905, + "step": 5497 + }, + { + "epoch": 2.0295348264828945, + "grad_norm": 0.8484210968017578, + "learning_rate": 3.7435358483617325e-06, + "loss": 0.6298, + "step": 5498 + }, + { + "epoch": 2.0299040118139304, + "grad_norm": 0.8972217440605164, + "learning_rate": 3.7431138882479356e-06, + "loss": 0.6055, + "step": 5499 + }, + { + "epoch": 2.030273197144967, + "grad_norm": 0.9165451526641846, + "learning_rate": 3.7426918810826694e-06, + "loss": 0.6063, + "step": 5500 + }, + { + "epoch": 2.030642382476003, + "grad_norm": 0.8530952334403992, + "learning_rate": 3.742269826881908e-06, + "loss": 0.6111, + "step": 5501 + }, + { + "epoch": 2.031011567807039, + "grad_norm": 0.8594837784767151, + "learning_rate": 3.741847725661626e-06, + "loss": 0.6093, + "step": 5502 + }, + { + "epoch": 2.0313807531380754, + "grad_norm": 0.8457330465316772, + "learning_rate": 3.7414255774377994e-06, + "loss": 0.6101, + "step": 5503 + }, + { + "epoch": 2.0317499384691113, + "grad_norm": 0.8666635751724243, + "learning_rate": 3.7410033822264064e-06, + "loss": 0.5724, + "step": 5504 + }, + { + "epoch": 2.0321191238001477, + "grad_norm": 0.8561533093452454, + "learning_rate": 3.7405811400434265e-06, + "loss": 0.6282, + "step": 5505 + }, + { + "epoch": 2.032488309131184, + "grad_norm": 0.8772096633911133, + "learning_rate": 3.7401588509048427e-06, + "loss": 0.6251, + "step": 5506 + }, + { + "epoch": 2.03285749446222, + "grad_norm": 0.8493826985359192, + "learning_rate": 3.7397365148266377e-06, + "loss": 0.6478, + "step": 5507 + }, + { + "epoch": 2.0332266797932563, + "grad_norm": 0.881813108921051, + "learning_rate": 3.739314131824797e-06, + "loss": 0.6244, + "step": 5508 + }, + { + "epoch": 2.0335958651242922, + "grad_norm": 0.8786026239395142, + "learning_rate": 3.7388917019153082e-06, + "loss": 0.6421, + "step": 5509 + }, + { + "epoch": 2.0339650504553286, + "grad_norm": 0.8880099058151245, + "learning_rate": 3.738469225114158e-06, + "loss": 0.6304, + "step": 5510 + }, + { + "epoch": 2.034334235786365, + "grad_norm": 0.8453584909439087, + "learning_rate": 3.738046701437339e-06, + "loss": 0.6389, + "step": 5511 + }, + { + "epoch": 2.034703421117401, + "grad_norm": 0.8454585075378418, + "learning_rate": 3.7376241309008433e-06, + "loss": 0.5997, + "step": 5512 + }, + { + "epoch": 2.0350726064484372, + "grad_norm": 0.8683871030807495, + "learning_rate": 3.737201513520665e-06, + "loss": 0.6119, + "step": 5513 + }, + { + "epoch": 2.035441791779473, + "grad_norm": 0.881239116191864, + "learning_rate": 3.7367788493127997e-06, + "loss": 0.6417, + "step": 5514 + }, + { + "epoch": 2.0358109771105095, + "grad_norm": 0.8690807223320007, + "learning_rate": 3.736356138293245e-06, + "loss": 0.6348, + "step": 5515 + }, + { + "epoch": 2.036180162441546, + "grad_norm": 0.8798560500144958, + "learning_rate": 3.7359333804780007e-06, + "loss": 0.6383, + "step": 5516 + }, + { + "epoch": 2.036549347772582, + "grad_norm": 0.8783085942268372, + "learning_rate": 3.735510575883069e-06, + "loss": 0.6018, + "step": 5517 + }, + { + "epoch": 2.036918533103618, + "grad_norm": 0.880907416343689, + "learning_rate": 3.735087724524451e-06, + "loss": 0.6194, + "step": 5518 + }, + { + "epoch": 2.037287718434654, + "grad_norm": 0.8544676899909973, + "learning_rate": 3.7346648264181523e-06, + "loss": 0.6483, + "step": 5519 + }, + { + "epoch": 2.0376569037656904, + "grad_norm": 0.8617678284645081, + "learning_rate": 3.7342418815801794e-06, + "loss": 0.6334, + "step": 5520 + }, + { + "epoch": 2.038026089096727, + "grad_norm": 0.883032500743866, + "learning_rate": 3.733818890026542e-06, + "loss": 0.5951, + "step": 5521 + }, + { + "epoch": 2.0383952744277627, + "grad_norm": 0.8915764093399048, + "learning_rate": 3.7333958517732483e-06, + "loss": 0.5999, + "step": 5522 + }, + { + "epoch": 2.038764459758799, + "grad_norm": 0.8715211749076843, + "learning_rate": 3.7329727668363114e-06, + "loss": 0.6109, + "step": 5523 + }, + { + "epoch": 2.039133645089835, + "grad_norm": 0.8502455949783325, + "learning_rate": 3.7325496352317436e-06, + "loss": 0.6094, + "step": 5524 + }, + { + "epoch": 2.0395028304208713, + "grad_norm": 0.8553181886672974, + "learning_rate": 3.732126456975561e-06, + "loss": 0.5943, + "step": 5525 + }, + { + "epoch": 2.0398720157519072, + "grad_norm": 0.871039628982544, + "learning_rate": 3.7317032320837814e-06, + "loss": 0.6129, + "step": 5526 + }, + { + "epoch": 2.0402412010829436, + "grad_norm": 0.8813468217849731, + "learning_rate": 3.7312799605724235e-06, + "loss": 0.6309, + "step": 5527 + }, + { + "epoch": 2.04061038641398, + "grad_norm": 0.8861755728721619, + "learning_rate": 3.7308566424575073e-06, + "loss": 0.6566, + "step": 5528 + }, + { + "epoch": 2.040979571745016, + "grad_norm": 0.9156109690666199, + "learning_rate": 3.730433277755055e-06, + "loss": 0.6135, + "step": 5529 + }, + { + "epoch": 2.0413487570760522, + "grad_norm": 0.8852343559265137, + "learning_rate": 3.7300098664810926e-06, + "loss": 0.6077, + "step": 5530 + }, + { + "epoch": 2.041717942407088, + "grad_norm": 0.8541592955589294, + "learning_rate": 3.729586408651644e-06, + "loss": 0.6362, + "step": 5531 + }, + { + "epoch": 2.0420871277381245, + "grad_norm": 0.9045969843864441, + "learning_rate": 3.7291629042827382e-06, + "loss": 0.617, + "step": 5532 + }, + { + "epoch": 2.042456313069161, + "grad_norm": 0.8667749762535095, + "learning_rate": 3.728739353390405e-06, + "loss": 0.6013, + "step": 5533 + }, + { + "epoch": 2.042825498400197, + "grad_norm": 0.8653653860092163, + "learning_rate": 3.7283157559906745e-06, + "loss": 0.6087, + "step": 5534 + }, + { + "epoch": 2.043194683731233, + "grad_norm": 0.878309965133667, + "learning_rate": 3.7278921120995803e-06, + "loss": 0.6108, + "step": 5535 + }, + { + "epoch": 2.043563869062269, + "grad_norm": 0.8755488991737366, + "learning_rate": 3.727468421733157e-06, + "loss": 0.6511, + "step": 5536 + }, + { + "epoch": 2.0439330543933054, + "grad_norm": 0.9267759323120117, + "learning_rate": 3.727044684907442e-06, + "loss": 0.6107, + "step": 5537 + }, + { + "epoch": 2.0443022397243418, + "grad_norm": 0.8735650181770325, + "learning_rate": 3.7266209016384726e-06, + "loss": 0.6052, + "step": 5538 + }, + { + "epoch": 2.0446714250553777, + "grad_norm": 0.9051711559295654, + "learning_rate": 3.726197071942289e-06, + "loss": 0.6351, + "step": 5539 + }, + { + "epoch": 2.045040610386414, + "grad_norm": 0.8761247992515564, + "learning_rate": 3.725773195834933e-06, + "loss": 0.6601, + "step": 5540 + }, + { + "epoch": 2.04540979571745, + "grad_norm": 0.8566622138023376, + "learning_rate": 3.725349273332449e-06, + "loss": 0.6053, + "step": 5541 + }, + { + "epoch": 2.0457789810484863, + "grad_norm": 0.8945100903511047, + "learning_rate": 3.7249253044508816e-06, + "loss": 0.6315, + "step": 5542 + }, + { + "epoch": 2.0461481663795227, + "grad_norm": 0.8738051652908325, + "learning_rate": 3.7245012892062784e-06, + "loss": 0.6472, + "step": 5543 + }, + { + "epoch": 2.0465173517105586, + "grad_norm": 0.9130378365516663, + "learning_rate": 3.724077227614687e-06, + "loss": 0.6602, + "step": 5544 + }, + { + "epoch": 2.046886537041595, + "grad_norm": 0.9207741618156433, + "learning_rate": 3.7236531196921592e-06, + "loss": 0.6235, + "step": 5545 + }, + { + "epoch": 2.047255722372631, + "grad_norm": 0.8368679881095886, + "learning_rate": 3.723228965454747e-06, + "loss": 0.6277, + "step": 5546 + }, + { + "epoch": 2.0476249077036672, + "grad_norm": 0.8883042931556702, + "learning_rate": 3.7228047649185053e-06, + "loss": 0.6067, + "step": 5547 + }, + { + "epoch": 2.0479940930347036, + "grad_norm": 0.887611448764801, + "learning_rate": 3.7223805180994882e-06, + "loss": 0.6182, + "step": 5548 + }, + { + "epoch": 2.0483632783657395, + "grad_norm": 0.8882414102554321, + "learning_rate": 3.721956225013754e-06, + "loss": 0.6157, + "step": 5549 + }, + { + "epoch": 2.048732463696776, + "grad_norm": 0.8834452033042908, + "learning_rate": 3.721531885677363e-06, + "loss": 0.6208, + "step": 5550 + }, + { + "epoch": 2.0491016490278118, + "grad_norm": 0.9007229208946228, + "learning_rate": 3.721107500106376e-06, + "loss": 0.6512, + "step": 5551 + }, + { + "epoch": 2.049470834358848, + "grad_norm": 0.8620003461837769, + "learning_rate": 3.720683068316855e-06, + "loss": 0.6037, + "step": 5552 + }, + { + "epoch": 2.0498400196898845, + "grad_norm": 0.8652275800704956, + "learning_rate": 3.720258590324865e-06, + "loss": 0.6097, + "step": 5553 + }, + { + "epoch": 2.0502092050209204, + "grad_norm": 0.8462552428245544, + "learning_rate": 3.719834066146472e-06, + "loss": 0.5712, + "step": 5554 + }, + { + "epoch": 2.0505783903519568, + "grad_norm": 0.8507275581359863, + "learning_rate": 3.7194094957977455e-06, + "loss": 0.6218, + "step": 5555 + }, + { + "epoch": 2.0509475756829927, + "grad_norm": 0.8811573386192322, + "learning_rate": 3.718984879294754e-06, + "loss": 0.6097, + "step": 5556 + }, + { + "epoch": 2.051316761014029, + "grad_norm": 0.8599187731742859, + "learning_rate": 3.71856021665357e-06, + "loss": 0.6531, + "step": 5557 + }, + { + "epoch": 2.0516859463450654, + "grad_norm": 0.8790730834007263, + "learning_rate": 3.7181355078902653e-06, + "loss": 0.6208, + "step": 5558 + }, + { + "epoch": 2.0520551316761013, + "grad_norm": 0.8696463704109192, + "learning_rate": 3.7177107530209174e-06, + "loss": 0.6061, + "step": 5559 + }, + { + "epoch": 2.0524243170071377, + "grad_norm": 0.9133856296539307, + "learning_rate": 3.717285952061601e-06, + "loss": 0.6261, + "step": 5560 + }, + { + "epoch": 2.0527935023381736, + "grad_norm": 0.8808449506759644, + "learning_rate": 3.7168611050283966e-06, + "loss": 0.6385, + "step": 5561 + }, + { + "epoch": 2.05316268766921, + "grad_norm": 0.8592244386672974, + "learning_rate": 3.716436211937382e-06, + "loss": 0.5775, + "step": 5562 + }, + { + "epoch": 2.0535318730002463, + "grad_norm": 0.8764339089393616, + "learning_rate": 3.716011272804641e-06, + "loss": 0.6322, + "step": 5563 + }, + { + "epoch": 2.0539010583312822, + "grad_norm": 0.8791851997375488, + "learning_rate": 3.7155862876462573e-06, + "loss": 0.6493, + "step": 5564 + }, + { + "epoch": 2.0542702436623186, + "grad_norm": 0.9045453667640686, + "learning_rate": 3.7151612564783166e-06, + "loss": 0.6265, + "step": 5565 + }, + { + "epoch": 2.0546394289933545, + "grad_norm": 0.8869640827178955, + "learning_rate": 3.7147361793169058e-06, + "loss": 0.66, + "step": 5566 + }, + { + "epoch": 2.055008614324391, + "grad_norm": 0.8812639713287354, + "learning_rate": 3.7143110561781137e-06, + "loss": 0.6256, + "step": 5567 + }, + { + "epoch": 2.055377799655427, + "grad_norm": 0.9151013493537903, + "learning_rate": 3.713885887078031e-06, + "loss": 0.6396, + "step": 5568 + }, + { + "epoch": 2.055746984986463, + "grad_norm": 0.8732081651687622, + "learning_rate": 3.7134606720327515e-06, + "loss": 0.6395, + "step": 5569 + }, + { + "epoch": 2.0561161703174995, + "grad_norm": 0.8626176714897156, + "learning_rate": 3.713035411058368e-06, + "loss": 0.6264, + "step": 5570 + }, + { + "epoch": 2.0564853556485354, + "grad_norm": 0.8875305652618408, + "learning_rate": 3.712610104170977e-06, + "loss": 0.6249, + "step": 5571 + }, + { + "epoch": 2.0568545409795718, + "grad_norm": 0.895147979259491, + "learning_rate": 3.712184751386676e-06, + "loss": 0.628, + "step": 5572 + }, + { + "epoch": 2.057223726310608, + "grad_norm": 0.8952255249023438, + "learning_rate": 3.711759352721565e-06, + "loss": 0.6247, + "step": 5573 + }, + { + "epoch": 2.057592911641644, + "grad_norm": 0.9503923058509827, + "learning_rate": 3.711333908191745e-06, + "loss": 0.6172, + "step": 5574 + }, + { + "epoch": 2.0579620969726804, + "grad_norm": 0.8700366616249084, + "learning_rate": 3.710908417813319e-06, + "loss": 0.6316, + "step": 5575 + }, + { + "epoch": 2.0583312823037163, + "grad_norm": 0.8815407752990723, + "learning_rate": 3.710482881602391e-06, + "loss": 0.607, + "step": 5576 + }, + { + "epoch": 2.0587004676347527, + "grad_norm": 0.8732828497886658, + "learning_rate": 3.7100572995750685e-06, + "loss": 0.609, + "step": 5577 + }, + { + "epoch": 2.059069652965789, + "grad_norm": 0.8777015209197998, + "learning_rate": 3.709631671747459e-06, + "loss": 0.6296, + "step": 5578 + }, + { + "epoch": 2.059438838296825, + "grad_norm": 0.8890860080718994, + "learning_rate": 3.709205998135673e-06, + "loss": 0.6265, + "step": 5579 + }, + { + "epoch": 2.0598080236278613, + "grad_norm": 0.8560751676559448, + "learning_rate": 3.708780278755821e-06, + "loss": 0.6417, + "step": 5580 + }, + { + "epoch": 2.060177208958897, + "grad_norm": 0.8838171362876892, + "learning_rate": 3.7083545136240172e-06, + "loss": 0.6295, + "step": 5581 + }, + { + "epoch": 2.0605463942899336, + "grad_norm": 0.864031195640564, + "learning_rate": 3.707928702756376e-06, + "loss": 0.6285, + "step": 5582 + }, + { + "epoch": 2.06091557962097, + "grad_norm": 0.8914440870285034, + "learning_rate": 3.707502846169015e-06, + "loss": 0.619, + "step": 5583 + }, + { + "epoch": 2.061284764952006, + "grad_norm": 0.8774434924125671, + "learning_rate": 3.7070769438780517e-06, + "loss": 0.6081, + "step": 5584 + }, + { + "epoch": 2.061653950283042, + "grad_norm": 0.842182993888855, + "learning_rate": 3.7066509958996085e-06, + "loss": 0.5836, + "step": 5585 + }, + { + "epoch": 2.062023135614078, + "grad_norm": 0.8526553511619568, + "learning_rate": 3.706225002249805e-06, + "loss": 0.6349, + "step": 5586 + }, + { + "epoch": 2.0623923209451145, + "grad_norm": 0.8910518288612366, + "learning_rate": 3.7057989629447653e-06, + "loss": 0.6136, + "step": 5587 + }, + { + "epoch": 2.062761506276151, + "grad_norm": 0.8508710861206055, + "learning_rate": 3.705372878000616e-06, + "loss": 0.5857, + "step": 5588 + }, + { + "epoch": 2.0631306916071868, + "grad_norm": 0.8847631812095642, + "learning_rate": 3.7049467474334837e-06, + "loss": 0.6107, + "step": 5589 + }, + { + "epoch": 2.063499876938223, + "grad_norm": 0.8931329250335693, + "learning_rate": 3.7045205712594975e-06, + "loss": 0.5978, + "step": 5590 + }, + { + "epoch": 2.063869062269259, + "grad_norm": 0.8498030304908752, + "learning_rate": 3.7040943494947875e-06, + "loss": 0.6424, + "step": 5591 + }, + { + "epoch": 2.0642382476002954, + "grad_norm": 0.8528651595115662, + "learning_rate": 3.703668082155486e-06, + "loss": 0.5865, + "step": 5592 + }, + { + "epoch": 2.0646074329313313, + "grad_norm": 0.9222245216369629, + "learning_rate": 3.703241769257728e-06, + "loss": 0.6638, + "step": 5593 + }, + { + "epoch": 2.0649766182623677, + "grad_norm": 0.8781119585037231, + "learning_rate": 3.702815410817649e-06, + "loss": 0.613, + "step": 5594 + }, + { + "epoch": 2.065345803593404, + "grad_norm": 0.847076952457428, + "learning_rate": 3.7023890068513867e-06, + "loss": 0.6057, + "step": 5595 + }, + { + "epoch": 2.06571498892444, + "grad_norm": 0.8820298314094543, + "learning_rate": 3.7019625573750784e-06, + "loss": 0.5914, + "step": 5596 + }, + { + "epoch": 2.0660841742554763, + "grad_norm": 0.8805500864982605, + "learning_rate": 3.701536062404868e-06, + "loss": 0.5792, + "step": 5597 + }, + { + "epoch": 2.066453359586512, + "grad_norm": 0.8839410543441772, + "learning_rate": 3.701109521956896e-06, + "loss": 0.6114, + "step": 5598 + }, + { + "epoch": 2.0668225449175486, + "grad_norm": 0.8980732560157776, + "learning_rate": 3.7006829360473084e-06, + "loss": 0.6541, + "step": 5599 + }, + { + "epoch": 2.067191730248585, + "grad_norm": 0.8886747360229492, + "learning_rate": 3.7002563046922502e-06, + "loss": 0.6294, + "step": 5600 + }, + { + "epoch": 2.067560915579621, + "grad_norm": 0.8962683081626892, + "learning_rate": 3.69982962790787e-06, + "loss": 0.6413, + "step": 5601 + }, + { + "epoch": 2.067930100910657, + "grad_norm": 0.8530528545379639, + "learning_rate": 3.6994029057103163e-06, + "loss": 0.6138, + "step": 5602 + }, + { + "epoch": 2.068299286241693, + "grad_norm": 0.8835702538490295, + "learning_rate": 3.6989761381157425e-06, + "loss": 0.5965, + "step": 5603 + }, + { + "epoch": 2.0686684715727295, + "grad_norm": 0.8552850484848022, + "learning_rate": 3.6985493251402995e-06, + "loss": 0.6009, + "step": 5604 + }, + { + "epoch": 2.069037656903766, + "grad_norm": 0.9212649464607239, + "learning_rate": 3.6981224668001427e-06, + "loss": 0.65, + "step": 5605 + }, + { + "epoch": 2.0694068422348018, + "grad_norm": 0.908538818359375, + "learning_rate": 3.697695563111428e-06, + "loss": 0.6559, + "step": 5606 + }, + { + "epoch": 2.069776027565838, + "grad_norm": 0.8850794434547424, + "learning_rate": 3.6972686140903158e-06, + "loss": 0.6378, + "step": 5607 + }, + { + "epoch": 2.070145212896874, + "grad_norm": 0.8948014378547668, + "learning_rate": 3.6968416197529634e-06, + "loss": 0.6319, + "step": 5608 + }, + { + "epoch": 2.0705143982279104, + "grad_norm": 0.8944111466407776, + "learning_rate": 3.696414580115534e-06, + "loss": 0.6156, + "step": 5609 + }, + { + "epoch": 2.0708835835589468, + "grad_norm": 0.9224655032157898, + "learning_rate": 3.6959874951941894e-06, + "loss": 0.6404, + "step": 5610 + }, + { + "epoch": 2.0712527688899827, + "grad_norm": 0.886384904384613, + "learning_rate": 3.6955603650050965e-06, + "loss": 0.6148, + "step": 5611 + }, + { + "epoch": 2.071621954221019, + "grad_norm": 0.8899317383766174, + "learning_rate": 3.6951331895644204e-06, + "loss": 0.6406, + "step": 5612 + }, + { + "epoch": 2.071991139552055, + "grad_norm": 0.8612719774246216, + "learning_rate": 3.694705968888331e-06, + "loss": 0.6053, + "step": 5613 + }, + { + "epoch": 2.0723603248830913, + "grad_norm": 0.9052849411964417, + "learning_rate": 3.694278702992997e-06, + "loss": 0.6046, + "step": 5614 + }, + { + "epoch": 2.0727295102141277, + "grad_norm": 0.8830356597900391, + "learning_rate": 3.693851391894591e-06, + "loss": 0.6259, + "step": 5615 + }, + { + "epoch": 2.0730986955451636, + "grad_norm": 0.8550585508346558, + "learning_rate": 3.693424035609287e-06, + "loss": 0.6286, + "step": 5616 + }, + { + "epoch": 2.0734678808762, + "grad_norm": 0.8804812431335449, + "learning_rate": 3.6929966341532604e-06, + "loss": 0.617, + "step": 5617 + }, + { + "epoch": 2.073837066207236, + "grad_norm": 0.8545202016830444, + "learning_rate": 3.6925691875426865e-06, + "loss": 0.5747, + "step": 5618 + }, + { + "epoch": 2.074206251538272, + "grad_norm": 0.8751912117004395, + "learning_rate": 3.692141695793747e-06, + "loss": 0.6061, + "step": 5619 + }, + { + "epoch": 2.0745754368693086, + "grad_norm": 0.868723452091217, + "learning_rate": 3.6917141589226184e-06, + "loss": 0.5993, + "step": 5620 + }, + { + "epoch": 2.0749446222003445, + "grad_norm": 0.8767339587211609, + "learning_rate": 3.6912865769454863e-06, + "loss": 0.6153, + "step": 5621 + }, + { + "epoch": 2.075313807531381, + "grad_norm": 0.8752591609954834, + "learning_rate": 3.690858949878533e-06, + "loss": 0.5925, + "step": 5622 + }, + { + "epoch": 2.0756829928624168, + "grad_norm": 0.8425766825675964, + "learning_rate": 3.6904312777379445e-06, + "loss": 0.6222, + "step": 5623 + }, + { + "epoch": 2.076052178193453, + "grad_norm": 0.8707861304283142, + "learning_rate": 3.6900035605399077e-06, + "loss": 0.6434, + "step": 5624 + }, + { + "epoch": 2.0764213635244895, + "grad_norm": 0.8821775913238525, + "learning_rate": 3.6895757983006114e-06, + "loss": 0.6249, + "step": 5625 + }, + { + "epoch": 2.0767905488555254, + "grad_norm": 0.883891224861145, + "learning_rate": 3.6891479910362468e-06, + "loss": 0.6217, + "step": 5626 + }, + { + "epoch": 2.0771597341865617, + "grad_norm": 0.8900715112686157, + "learning_rate": 3.6887201387630064e-06, + "loss": 0.6185, + "step": 5627 + }, + { + "epoch": 2.0775289195175977, + "grad_norm": 0.8866127133369446, + "learning_rate": 3.6882922414970834e-06, + "loss": 0.5915, + "step": 5628 + }, + { + "epoch": 2.077898104848634, + "grad_norm": 0.8613858222961426, + "learning_rate": 3.6878642992546737e-06, + "loss": 0.618, + "step": 5629 + }, + { + "epoch": 2.0782672901796704, + "grad_norm": 0.884111762046814, + "learning_rate": 3.6874363120519757e-06, + "loss": 0.6262, + "step": 5630 + }, + { + "epoch": 2.0786364755107063, + "grad_norm": 0.8741036653518677, + "learning_rate": 3.6870082799051886e-06, + "loss": 0.598, + "step": 5631 + }, + { + "epoch": 2.0790056608417427, + "grad_norm": 0.8753082156181335, + "learning_rate": 3.686580202830512e-06, + "loss": 0.6078, + "step": 5632 + }, + { + "epoch": 2.0793748461727786, + "grad_norm": 0.9029115438461304, + "learning_rate": 3.68615208084415e-06, + "loss": 0.6125, + "step": 5633 + }, + { + "epoch": 2.079744031503815, + "grad_norm": 0.8937019109725952, + "learning_rate": 3.6857239139623048e-06, + "loss": 0.6155, + "step": 5634 + }, + { + "epoch": 2.0801132168348513, + "grad_norm": 0.8694891333580017, + "learning_rate": 3.6852957022011843e-06, + "loss": 0.5849, + "step": 5635 + }, + { + "epoch": 2.080482402165887, + "grad_norm": 0.8835029006004333, + "learning_rate": 3.6848674455769955e-06, + "loss": 0.6074, + "step": 5636 + }, + { + "epoch": 2.0808515874969236, + "grad_norm": 0.8859110474586487, + "learning_rate": 3.6844391441059485e-06, + "loss": 0.6199, + "step": 5637 + }, + { + "epoch": 2.0812207728279595, + "grad_norm": 0.8923909664154053, + "learning_rate": 3.6840107978042526e-06, + "loss": 0.6504, + "step": 5638 + }, + { + "epoch": 2.081589958158996, + "grad_norm": 0.8913730978965759, + "learning_rate": 3.6835824066881225e-06, + "loss": 0.6182, + "step": 5639 + }, + { + "epoch": 2.081959143490032, + "grad_norm": 0.9201462864875793, + "learning_rate": 3.6831539707737712e-06, + "loss": 0.5949, + "step": 5640 + }, + { + "epoch": 2.082328328821068, + "grad_norm": 0.8900795578956604, + "learning_rate": 3.682725490077417e-06, + "loss": 0.6327, + "step": 5641 + }, + { + "epoch": 2.0826975141521045, + "grad_norm": 0.8858770728111267, + "learning_rate": 3.6822969646152744e-06, + "loss": 0.6125, + "step": 5642 + }, + { + "epoch": 2.0830666994831404, + "grad_norm": 0.9208259582519531, + "learning_rate": 3.681868394403566e-06, + "loss": 0.6262, + "step": 5643 + }, + { + "epoch": 2.0834358848141767, + "grad_norm": 0.8563006520271301, + "learning_rate": 3.6814397794585112e-06, + "loss": 0.6291, + "step": 5644 + }, + { + "epoch": 2.083805070145213, + "grad_norm": 0.913036584854126, + "learning_rate": 3.681011119796334e-06, + "loss": 0.6064, + "step": 5645 + }, + { + "epoch": 2.084174255476249, + "grad_norm": 0.8927748203277588, + "learning_rate": 3.680582415433258e-06, + "loss": 0.5981, + "step": 5646 + }, + { + "epoch": 2.0845434408072854, + "grad_norm": 0.9170436859130859, + "learning_rate": 3.6801536663855116e-06, + "loss": 0.6654, + "step": 5647 + }, + { + "epoch": 2.0849126261383213, + "grad_norm": 0.901248037815094, + "learning_rate": 3.6797248726693196e-06, + "loss": 0.648, + "step": 5648 + }, + { + "epoch": 2.0852818114693576, + "grad_norm": 0.8986033797264099, + "learning_rate": 3.6792960343009147e-06, + "loss": 0.6032, + "step": 5649 + }, + { + "epoch": 2.085650996800394, + "grad_norm": 0.8695364594459534, + "learning_rate": 3.6788671512965268e-06, + "loss": 0.5997, + "step": 5650 + }, + { + "epoch": 2.08602018213143, + "grad_norm": 0.8859792351722717, + "learning_rate": 3.6784382236723896e-06, + "loss": 0.6233, + "step": 5651 + }, + { + "epoch": 2.0863893674624663, + "grad_norm": 0.8826692700386047, + "learning_rate": 3.6780092514447376e-06, + "loss": 0.6145, + "step": 5652 + }, + { + "epoch": 2.086758552793502, + "grad_norm": 0.8958509564399719, + "learning_rate": 3.677580234629806e-06, + "loss": 0.6352, + "step": 5653 + }, + { + "epoch": 2.0871277381245386, + "grad_norm": 0.8824563026428223, + "learning_rate": 3.6771511732438353e-06, + "loss": 0.6624, + "step": 5654 + }, + { + "epoch": 2.087496923455575, + "grad_norm": 0.9004684090614319, + "learning_rate": 3.6767220673030646e-06, + "loss": 0.6233, + "step": 5655 + }, + { + "epoch": 2.087866108786611, + "grad_norm": 0.8484705090522766, + "learning_rate": 3.6762929168237336e-06, + "loss": 0.5911, + "step": 5656 + }, + { + "epoch": 2.088235294117647, + "grad_norm": 0.8635362386703491, + "learning_rate": 3.675863721822088e-06, + "loss": 0.5861, + "step": 5657 + }, + { + "epoch": 2.088604479448683, + "grad_norm": 0.8606535196304321, + "learning_rate": 3.675434482314371e-06, + "loss": 0.6252, + "step": 5658 + }, + { + "epoch": 2.0889736647797195, + "grad_norm": 0.8915131092071533, + "learning_rate": 3.6750051983168305e-06, + "loss": 0.6037, + "step": 5659 + }, + { + "epoch": 2.0893428501107554, + "grad_norm": 0.8515760898590088, + "learning_rate": 3.6745758698457137e-06, + "loss": 0.6035, + "step": 5660 + }, + { + "epoch": 2.0897120354417917, + "grad_norm": 0.8888070583343506, + "learning_rate": 3.6741464969172713e-06, + "loss": 0.6148, + "step": 5661 + }, + { + "epoch": 2.090081220772828, + "grad_norm": 0.9536455273628235, + "learning_rate": 3.673717079547754e-06, + "loss": 0.6114, + "step": 5662 + }, + { + "epoch": 2.090450406103864, + "grad_norm": 0.8525761365890503, + "learning_rate": 3.673287617753416e-06, + "loss": 0.5752, + "step": 5663 + }, + { + "epoch": 2.0908195914349004, + "grad_norm": 0.8579257726669312, + "learning_rate": 3.672858111550512e-06, + "loss": 0.5979, + "step": 5664 + }, + { + "epoch": 2.0911887767659363, + "grad_norm": 0.8967726826667786, + "learning_rate": 3.6724285609552986e-06, + "loss": 0.6213, + "step": 5665 + }, + { + "epoch": 2.0915579620969726, + "grad_norm": 0.8674260973930359, + "learning_rate": 3.671998965984034e-06, + "loss": 0.6363, + "step": 5666 + }, + { + "epoch": 2.091927147428009, + "grad_norm": 0.8495336174964905, + "learning_rate": 3.6715693266529786e-06, + "loss": 0.5939, + "step": 5667 + }, + { + "epoch": 2.092296332759045, + "grad_norm": 0.8818087577819824, + "learning_rate": 3.6711396429783937e-06, + "loss": 0.6627, + "step": 5668 + }, + { + "epoch": 2.0926655180900813, + "grad_norm": 0.8691592216491699, + "learning_rate": 3.6707099149765435e-06, + "loss": 0.6145, + "step": 5669 + }, + { + "epoch": 2.093034703421117, + "grad_norm": 0.8658162355422974, + "learning_rate": 3.6702801426636914e-06, + "loss": 0.6032, + "step": 5670 + }, + { + "epoch": 2.0934038887521536, + "grad_norm": 0.8452206254005432, + "learning_rate": 3.6698503260561064e-06, + "loss": 0.5961, + "step": 5671 + }, + { + "epoch": 2.09377307408319, + "grad_norm": 0.9219603538513184, + "learning_rate": 3.669420465170055e-06, + "loss": 0.6512, + "step": 5672 + }, + { + "epoch": 2.094142259414226, + "grad_norm": 0.8894465565681458, + "learning_rate": 3.6689905600218086e-06, + "loss": 0.6421, + "step": 5673 + }, + { + "epoch": 2.094511444745262, + "grad_norm": 0.8922712802886963, + "learning_rate": 3.6685606106276382e-06, + "loss": 0.6363, + "step": 5674 + }, + { + "epoch": 2.094880630076298, + "grad_norm": 0.9307752251625061, + "learning_rate": 3.6681306170038177e-06, + "loss": 0.6489, + "step": 5675 + }, + { + "epoch": 2.0952498154073345, + "grad_norm": 0.879523754119873, + "learning_rate": 3.6677005791666217e-06, + "loss": 0.6311, + "step": 5676 + }, + { + "epoch": 2.095619000738371, + "grad_norm": 0.8996968865394592, + "learning_rate": 3.667270497132327e-06, + "loss": 0.6283, + "step": 5677 + }, + { + "epoch": 2.0959881860694067, + "grad_norm": 0.8886715173721313, + "learning_rate": 3.666840370917213e-06, + "loss": 0.61, + "step": 5678 + }, + { + "epoch": 2.096357371400443, + "grad_norm": 0.9556635618209839, + "learning_rate": 3.666410200537559e-06, + "loss": 0.6207, + "step": 5679 + }, + { + "epoch": 2.096726556731479, + "grad_norm": 0.8507434725761414, + "learning_rate": 3.6659799860096475e-06, + "loss": 0.6197, + "step": 5680 + }, + { + "epoch": 2.0970957420625154, + "grad_norm": 0.8683716058731079, + "learning_rate": 3.665549727349761e-06, + "loss": 0.6285, + "step": 5681 + }, + { + "epoch": 2.0974649273935517, + "grad_norm": 0.8913787603378296, + "learning_rate": 3.665119424574185e-06, + "loss": 0.6052, + "step": 5682 + }, + { + "epoch": 2.0978341127245876, + "grad_norm": 0.8760581016540527, + "learning_rate": 3.6646890776992074e-06, + "loss": 0.6115, + "step": 5683 + }, + { + "epoch": 2.098203298055624, + "grad_norm": 0.8781512975692749, + "learning_rate": 3.6642586867411145e-06, + "loss": 0.6475, + "step": 5684 + }, + { + "epoch": 2.09857248338666, + "grad_norm": 0.9206271171569824, + "learning_rate": 3.663828251716199e-06, + "loss": 0.6522, + "step": 5685 + }, + { + "epoch": 2.0989416687176963, + "grad_norm": 0.8892074227333069, + "learning_rate": 3.66339777264075e-06, + "loss": 0.6344, + "step": 5686 + }, + { + "epoch": 2.0993108540487326, + "grad_norm": 0.8600207567214966, + "learning_rate": 3.662967249531064e-06, + "loss": 0.6, + "step": 5687 + }, + { + "epoch": 2.0996800393797685, + "grad_norm": 0.8891721963882446, + "learning_rate": 3.6625366824034337e-06, + "loss": 0.6254, + "step": 5688 + }, + { + "epoch": 2.100049224710805, + "grad_norm": 0.8926984667778015, + "learning_rate": 3.662106071274157e-06, + "loss": 0.6381, + "step": 5689 + }, + { + "epoch": 2.100418410041841, + "grad_norm": 0.8945996165275574, + "learning_rate": 3.6616754161595325e-06, + "loss": 0.633, + "step": 5690 + }, + { + "epoch": 2.100787595372877, + "grad_norm": 0.8974658250808716, + "learning_rate": 3.6612447170758604e-06, + "loss": 0.621, + "step": 5691 + }, + { + "epoch": 2.1011567807039135, + "grad_norm": 0.8869962692260742, + "learning_rate": 3.6608139740394415e-06, + "loss": 0.6374, + "step": 5692 + }, + { + "epoch": 2.1015259660349495, + "grad_norm": 0.9165622591972351, + "learning_rate": 3.660383187066581e-06, + "loss": 0.611, + "step": 5693 + }, + { + "epoch": 2.101895151365986, + "grad_norm": 0.8746391534805298, + "learning_rate": 3.659952356173582e-06, + "loss": 0.6263, + "step": 5694 + }, + { + "epoch": 2.1022643366970217, + "grad_norm": 0.8896817564964294, + "learning_rate": 3.6595214813767537e-06, + "loss": 0.6134, + "step": 5695 + }, + { + "epoch": 2.102633522028058, + "grad_norm": 0.905481219291687, + "learning_rate": 3.6590905626924017e-06, + "loss": 0.5948, + "step": 5696 + }, + { + "epoch": 2.1030027073590944, + "grad_norm": 0.8920099139213562, + "learning_rate": 3.6586596001368392e-06, + "loss": 0.587, + "step": 5697 + }, + { + "epoch": 2.1033718926901304, + "grad_norm": 0.8745411038398743, + "learning_rate": 3.6582285937263755e-06, + "loss": 0.6079, + "step": 5698 + }, + { + "epoch": 2.1037410780211667, + "grad_norm": 0.8883611559867859, + "learning_rate": 3.657797543477326e-06, + "loss": 0.6312, + "step": 5699 + }, + { + "epoch": 2.1041102633522026, + "grad_norm": 0.9404711127281189, + "learning_rate": 3.657366449406004e-06, + "loss": 0.6308, + "step": 5700 + }, + { + "epoch": 2.104479448683239, + "grad_norm": 0.9406144022941589, + "learning_rate": 3.6569353115287277e-06, + "loss": 0.6232, + "step": 5701 + }, + { + "epoch": 2.1048486340142754, + "grad_norm": 0.8909290432929993, + "learning_rate": 3.656504129861814e-06, + "loss": 0.5699, + "step": 5702 + }, + { + "epoch": 2.1052178193453113, + "grad_norm": 0.8653925061225891, + "learning_rate": 3.6560729044215852e-06, + "loss": 0.6118, + "step": 5703 + }, + { + "epoch": 2.1055870046763476, + "grad_norm": 0.8614654541015625, + "learning_rate": 3.655641635224362e-06, + "loss": 0.6247, + "step": 5704 + }, + { + "epoch": 2.1059561900073835, + "grad_norm": 0.8645123839378357, + "learning_rate": 3.6552103222864667e-06, + "loss": 0.5978, + "step": 5705 + }, + { + "epoch": 2.10632537533842, + "grad_norm": 0.8699943423271179, + "learning_rate": 3.654778965624225e-06, + "loss": 0.6283, + "step": 5706 + }, + { + "epoch": 2.1066945606694563, + "grad_norm": 0.8710001111030579, + "learning_rate": 3.654347565253965e-06, + "loss": 0.6287, + "step": 5707 + }, + { + "epoch": 2.107063746000492, + "grad_norm": 0.9010536670684814, + "learning_rate": 3.6539161211920137e-06, + "loss": 0.6112, + "step": 5708 + }, + { + "epoch": 2.1074329313315285, + "grad_norm": 0.8644676804542542, + "learning_rate": 3.6534846334547016e-06, + "loss": 0.6185, + "step": 5709 + }, + { + "epoch": 2.1078021166625645, + "grad_norm": 0.8907213807106018, + "learning_rate": 3.6530531020583594e-06, + "loss": 0.6203, + "step": 5710 + }, + { + "epoch": 2.108171301993601, + "grad_norm": 0.8946707844734192, + "learning_rate": 3.6526215270193215e-06, + "loss": 0.6245, + "step": 5711 + }, + { + "epoch": 2.108540487324637, + "grad_norm": 0.8742067217826843, + "learning_rate": 3.6521899083539223e-06, + "loss": 0.6409, + "step": 5712 + }, + { + "epoch": 2.108909672655673, + "grad_norm": 0.8907426595687866, + "learning_rate": 3.6517582460784995e-06, + "loss": 0.6462, + "step": 5713 + }, + { + "epoch": 2.1092788579867094, + "grad_norm": 0.9089739322662354, + "learning_rate": 3.65132654020939e-06, + "loss": 0.6459, + "step": 5714 + }, + { + "epoch": 2.1096480433177454, + "grad_norm": 0.883345901966095, + "learning_rate": 3.6508947907629344e-06, + "loss": 0.6408, + "step": 5715 + }, + { + "epoch": 2.1100172286487817, + "grad_norm": 0.9117772579193115, + "learning_rate": 3.650462997755474e-06, + "loss": 0.6374, + "step": 5716 + }, + { + "epoch": 2.110386413979818, + "grad_norm": 0.8555970191955566, + "learning_rate": 3.6500311612033528e-06, + "loss": 0.6592, + "step": 5717 + }, + { + "epoch": 2.110755599310854, + "grad_norm": 0.8554747700691223, + "learning_rate": 3.6495992811229146e-06, + "loss": 0.5804, + "step": 5718 + }, + { + "epoch": 2.1111247846418904, + "grad_norm": 0.8642599582672119, + "learning_rate": 3.649167357530507e-06, + "loss": 0.6333, + "step": 5719 + }, + { + "epoch": 2.1114939699729263, + "grad_norm": 0.8957363367080688, + "learning_rate": 3.648735390442477e-06, + "loss": 0.6647, + "step": 5720 + }, + { + "epoch": 2.1118631553039626, + "grad_norm": 0.9073878526687622, + "learning_rate": 3.6483033798751756e-06, + "loss": 0.6133, + "step": 5721 + }, + { + "epoch": 2.112232340634999, + "grad_norm": 0.8894394636154175, + "learning_rate": 3.6478713258449525e-06, + "loss": 0.5748, + "step": 5722 + }, + { + "epoch": 2.112601525966035, + "grad_norm": 0.8513848781585693, + "learning_rate": 3.6474392283681637e-06, + "loss": 0.5918, + "step": 5723 + }, + { + "epoch": 2.1129707112970713, + "grad_norm": 0.874152421951294, + "learning_rate": 3.6470070874611607e-06, + "loss": 0.6211, + "step": 5724 + }, + { + "epoch": 2.113339896628107, + "grad_norm": 0.8650266528129578, + "learning_rate": 3.6465749031403018e-06, + "loss": 0.6389, + "step": 5725 + }, + { + "epoch": 2.1137090819591435, + "grad_norm": 0.9126646518707275, + "learning_rate": 3.6461426754219453e-06, + "loss": 0.633, + "step": 5726 + }, + { + "epoch": 2.1140782672901794, + "grad_norm": 0.8872065544128418, + "learning_rate": 3.64571040432245e-06, + "loss": 0.6548, + "step": 5727 + }, + { + "epoch": 2.114447452621216, + "grad_norm": 0.9129329323768616, + "learning_rate": 3.6452780898581775e-06, + "loss": 0.6398, + "step": 5728 + }, + { + "epoch": 2.114816637952252, + "grad_norm": 0.887728214263916, + "learning_rate": 3.6448457320454904e-06, + "loss": 0.6426, + "step": 5729 + }, + { + "epoch": 2.115185823283288, + "grad_norm": 0.9006112813949585, + "learning_rate": 3.6444133309007544e-06, + "loss": 0.6346, + "step": 5730 + }, + { + "epoch": 2.1155550086143244, + "grad_norm": 0.8866620063781738, + "learning_rate": 3.6439808864403344e-06, + "loss": 0.6508, + "step": 5731 + }, + { + "epoch": 2.115924193945361, + "grad_norm": 0.8912476301193237, + "learning_rate": 3.6435483986805985e-06, + "loss": 0.6192, + "step": 5732 + }, + { + "epoch": 2.1162933792763967, + "grad_norm": 0.8940373659133911, + "learning_rate": 3.6431158676379174e-06, + "loss": 0.66, + "step": 5733 + }, + { + "epoch": 2.116662564607433, + "grad_norm": 0.8899433612823486, + "learning_rate": 3.6426832933286616e-06, + "loss": 0.6006, + "step": 5734 + }, + { + "epoch": 2.117031749938469, + "grad_norm": 0.8601537942886353, + "learning_rate": 3.6422506757692035e-06, + "loss": 0.6368, + "step": 5735 + }, + { + "epoch": 2.1174009352695053, + "grad_norm": 0.8928695321083069, + "learning_rate": 3.6418180149759174e-06, + "loss": 0.6201, + "step": 5736 + }, + { + "epoch": 2.1177701206005413, + "grad_norm": 0.861672043800354, + "learning_rate": 3.6413853109651803e-06, + "loss": 0.5757, + "step": 5737 + }, + { + "epoch": 2.1181393059315776, + "grad_norm": 0.9174507260322571, + "learning_rate": 3.6409525637533692e-06, + "loss": 0.6168, + "step": 5738 + }, + { + "epoch": 2.118508491262614, + "grad_norm": 0.9072368144989014, + "learning_rate": 3.640519773356863e-06, + "loss": 0.6372, + "step": 5739 + }, + { + "epoch": 2.11887767659365, + "grad_norm": 0.8939194083213806, + "learning_rate": 3.6400869397920447e-06, + "loss": 0.6359, + "step": 5740 + }, + { + "epoch": 2.1192468619246863, + "grad_norm": 0.9079878926277161, + "learning_rate": 3.639654063075295e-06, + "loss": 0.6359, + "step": 5741 + }, + { + "epoch": 2.119616047255722, + "grad_norm": 0.9029211401939392, + "learning_rate": 3.6392211432229983e-06, + "loss": 0.6166, + "step": 5742 + }, + { + "epoch": 2.1199852325867585, + "grad_norm": 0.8675274848937988, + "learning_rate": 3.6387881802515414e-06, + "loss": 0.6149, + "step": 5743 + }, + { + "epoch": 2.120354417917795, + "grad_norm": 0.8810080289840698, + "learning_rate": 3.638355174177311e-06, + "loss": 0.6248, + "step": 5744 + }, + { + "epoch": 2.120723603248831, + "grad_norm": 0.8775829076766968, + "learning_rate": 3.6379221250166963e-06, + "loss": 0.5988, + "step": 5745 + }, + { + "epoch": 2.121092788579867, + "grad_norm": 0.8499928712844849, + "learning_rate": 3.6374890327860884e-06, + "loss": 0.5893, + "step": 5746 + }, + { + "epoch": 2.121461973910903, + "grad_norm": 0.8810544013977051, + "learning_rate": 3.63705589750188e-06, + "loss": 0.6443, + "step": 5747 + }, + { + "epoch": 2.1218311592419394, + "grad_norm": 0.8759510517120361, + "learning_rate": 3.636622719180465e-06, + "loss": 0.6406, + "step": 5748 + }, + { + "epoch": 2.122200344572976, + "grad_norm": 0.8930089473724365, + "learning_rate": 3.6361894978382377e-06, + "loss": 0.6078, + "step": 5749 + }, + { + "epoch": 2.1225695299040117, + "grad_norm": 0.9153942465782166, + "learning_rate": 3.635756233491597e-06, + "loss": 0.6177, + "step": 5750 + }, + { + "epoch": 2.122938715235048, + "grad_norm": 0.8819100260734558, + "learning_rate": 3.6353229261569418e-06, + "loss": 0.6126, + "step": 5751 + }, + { + "epoch": 2.123307900566084, + "grad_norm": 0.8859001398086548, + "learning_rate": 3.6348895758506715e-06, + "loss": 0.6121, + "step": 5752 + }, + { + "epoch": 2.1236770858971203, + "grad_norm": 0.9136189222335815, + "learning_rate": 3.6344561825891887e-06, + "loss": 0.6418, + "step": 5753 + }, + { + "epoch": 2.1240462712281567, + "grad_norm": 0.8824828863143921, + "learning_rate": 3.6340227463888986e-06, + "loss": 0.6345, + "step": 5754 + }, + { + "epoch": 2.1244154565591926, + "grad_norm": 0.8912015557289124, + "learning_rate": 3.633589267266204e-06, + "loss": 0.6191, + "step": 5755 + }, + { + "epoch": 2.124784641890229, + "grad_norm": 0.8655781149864197, + "learning_rate": 3.6331557452375145e-06, + "loss": 0.6051, + "step": 5756 + }, + { + "epoch": 2.125153827221265, + "grad_norm": 0.8800578117370605, + "learning_rate": 3.632722180319238e-06, + "loss": 0.6321, + "step": 5757 + }, + { + "epoch": 2.1255230125523012, + "grad_norm": 0.9078778624534607, + "learning_rate": 3.6322885725277832e-06, + "loss": 0.6052, + "step": 5758 + }, + { + "epoch": 2.1258921978833376, + "grad_norm": 0.8866066932678223, + "learning_rate": 3.631854921879564e-06, + "loss": 0.6545, + "step": 5759 + }, + { + "epoch": 2.1262613832143735, + "grad_norm": 0.8639248013496399, + "learning_rate": 3.6314212283909934e-06, + "loss": 0.5918, + "step": 5760 + }, + { + "epoch": 2.12663056854541, + "grad_norm": 0.9113978147506714, + "learning_rate": 3.630987492078486e-06, + "loss": 0.6443, + "step": 5761 + }, + { + "epoch": 2.126999753876446, + "grad_norm": 0.8464299440383911, + "learning_rate": 3.630553712958459e-06, + "loss": 0.6063, + "step": 5762 + }, + { + "epoch": 2.127368939207482, + "grad_norm": 0.8738041520118713, + "learning_rate": 3.6301198910473305e-06, + "loss": 0.5995, + "step": 5763 + }, + { + "epoch": 2.1277381245385185, + "grad_norm": 0.872807502746582, + "learning_rate": 3.629686026361522e-06, + "loss": 0.6126, + "step": 5764 + }, + { + "epoch": 2.1281073098695544, + "grad_norm": 0.8655477166175842, + "learning_rate": 3.629252118917453e-06, + "loss": 0.6091, + "step": 5765 + }, + { + "epoch": 2.128476495200591, + "grad_norm": 0.9030638337135315, + "learning_rate": 3.628818168731547e-06, + "loss": 0.6391, + "step": 5766 + }, + { + "epoch": 2.1288456805316267, + "grad_norm": 0.9224795699119568, + "learning_rate": 3.6283841758202304e-06, + "loss": 0.6388, + "step": 5767 + }, + { + "epoch": 2.129214865862663, + "grad_norm": 0.863410234451294, + "learning_rate": 3.6279501401999296e-06, + "loss": 0.6099, + "step": 5768 + }, + { + "epoch": 2.1295840511936994, + "grad_norm": 0.9004232883453369, + "learning_rate": 3.6275160618870706e-06, + "loss": 0.6373, + "step": 5769 + }, + { + "epoch": 2.1299532365247353, + "grad_norm": 0.8587403893470764, + "learning_rate": 3.627081940898085e-06, + "loss": 0.6134, + "step": 5770 + }, + { + "epoch": 2.1303224218557717, + "grad_norm": 0.8513883352279663, + "learning_rate": 3.626647777249404e-06, + "loss": 0.6021, + "step": 5771 + }, + { + "epoch": 2.1306916071868076, + "grad_norm": 0.8922625184059143, + "learning_rate": 3.62621357095746e-06, + "loss": 0.601, + "step": 5772 + }, + { + "epoch": 2.131060792517844, + "grad_norm": 0.8900362849235535, + "learning_rate": 3.6257793220386876e-06, + "loss": 0.5935, + "step": 5773 + }, + { + "epoch": 2.1314299778488803, + "grad_norm": 0.875726580619812, + "learning_rate": 3.625345030509524e-06, + "loss": 0.62, + "step": 5774 + }, + { + "epoch": 2.1317991631799162, + "grad_norm": 0.8733262419700623, + "learning_rate": 3.624910696386405e-06, + "loss": 0.5937, + "step": 5775 + }, + { + "epoch": 2.1321683485109526, + "grad_norm": 0.852220892906189, + "learning_rate": 3.6244763196857714e-06, + "loss": 0.665, + "step": 5776 + }, + { + "epoch": 2.1325375338419885, + "grad_norm": 0.9115581512451172, + "learning_rate": 3.6240419004240644e-06, + "loss": 0.6323, + "step": 5777 + }, + { + "epoch": 2.132906719173025, + "grad_norm": 0.8866631388664246, + "learning_rate": 3.6236074386177263e-06, + "loss": 0.6094, + "step": 5778 + }, + { + "epoch": 2.1332759045040612, + "grad_norm": 0.9196190237998962, + "learning_rate": 3.6231729342832006e-06, + "loss": 0.6124, + "step": 5779 + }, + { + "epoch": 2.133645089835097, + "grad_norm": 0.8615099191665649, + "learning_rate": 3.6227383874369344e-06, + "loss": 0.6233, + "step": 5780 + }, + { + "epoch": 2.1340142751661335, + "grad_norm": 0.9160125255584717, + "learning_rate": 3.622303798095374e-06, + "loss": 0.6205, + "step": 5781 + }, + { + "epoch": 2.1343834604971694, + "grad_norm": 0.868421196937561, + "learning_rate": 3.62186916627497e-06, + "loss": 0.6194, + "step": 5782 + }, + { + "epoch": 2.134752645828206, + "grad_norm": 0.8679133057594299, + "learning_rate": 3.6214344919921712e-06, + "loss": 0.5877, + "step": 5783 + }, + { + "epoch": 2.135121831159242, + "grad_norm": 0.8637129068374634, + "learning_rate": 3.6209997752634318e-06, + "loss": 0.6343, + "step": 5784 + }, + { + "epoch": 2.135491016490278, + "grad_norm": 0.8744029402732849, + "learning_rate": 3.6205650161052043e-06, + "loss": 0.5937, + "step": 5785 + }, + { + "epoch": 2.1358602018213144, + "grad_norm": 0.8856083154678345, + "learning_rate": 3.620130214533944e-06, + "loss": 0.6114, + "step": 5786 + }, + { + "epoch": 2.1362293871523503, + "grad_norm": 0.9382786750793457, + "learning_rate": 3.6196953705661087e-06, + "loss": 0.6271, + "step": 5787 + }, + { + "epoch": 2.1365985724833867, + "grad_norm": 0.8717965483665466, + "learning_rate": 3.6192604842181576e-06, + "loss": 0.5699, + "step": 5788 + }, + { + "epoch": 2.136967757814423, + "grad_norm": 0.8848774433135986, + "learning_rate": 3.61882555550655e-06, + "loss": 0.6001, + "step": 5789 + }, + { + "epoch": 2.137336943145459, + "grad_norm": 0.8886960744857788, + "learning_rate": 3.618390584447748e-06, + "loss": 0.5985, + "step": 5790 + }, + { + "epoch": 2.1377061284764953, + "grad_norm": 0.9042415618896484, + "learning_rate": 3.617955571058216e-06, + "loss": 0.6395, + "step": 5791 + }, + { + "epoch": 2.1380753138075312, + "grad_norm": 0.8853775858879089, + "learning_rate": 3.617520515354418e-06, + "loss": 0.6299, + "step": 5792 + }, + { + "epoch": 2.1384444991385676, + "grad_norm": 0.8737713098526001, + "learning_rate": 3.6170854173528214e-06, + "loss": 0.6105, + "step": 5793 + }, + { + "epoch": 2.1388136844696035, + "grad_norm": 0.8731061816215515, + "learning_rate": 3.6166502770698946e-06, + "loss": 0.6591, + "step": 5794 + }, + { + "epoch": 2.13918286980064, + "grad_norm": 0.8768091201782227, + "learning_rate": 3.6162150945221066e-06, + "loss": 0.6307, + "step": 5795 + }, + { + "epoch": 2.1395520551316762, + "grad_norm": 0.9193796515464783, + "learning_rate": 3.6157798697259296e-06, + "loss": 0.622, + "step": 5796 + }, + { + "epoch": 2.139921240462712, + "grad_norm": 0.8786208033561707, + "learning_rate": 3.6153446026978367e-06, + "loss": 0.6012, + "step": 5797 + }, + { + "epoch": 2.1402904257937485, + "grad_norm": 0.9239831566810608, + "learning_rate": 3.6149092934543036e-06, + "loss": 0.6473, + "step": 5798 + }, + { + "epoch": 2.140659611124785, + "grad_norm": 0.9001717567443848, + "learning_rate": 3.6144739420118047e-06, + "loss": 0.63, + "step": 5799 + }, + { + "epoch": 2.141028796455821, + "grad_norm": 0.8809717893600464, + "learning_rate": 3.6140385483868185e-06, + "loss": 0.6219, + "step": 5800 + }, + { + "epoch": 2.141397981786857, + "grad_norm": 0.8653585910797119, + "learning_rate": 3.613603112595825e-06, + "loss": 0.5869, + "step": 5801 + }, + { + "epoch": 2.141767167117893, + "grad_norm": 0.8630864024162292, + "learning_rate": 3.613167634655306e-06, + "loss": 0.6358, + "step": 5802 + }, + { + "epoch": 2.1421363524489294, + "grad_norm": 0.914350688457489, + "learning_rate": 3.612732114581743e-06, + "loss": 0.6483, + "step": 5803 + }, + { + "epoch": 2.1425055377799653, + "grad_norm": 0.920939028263092, + "learning_rate": 3.6122965523916205e-06, + "loss": 0.6267, + "step": 5804 + }, + { + "epoch": 2.1428747231110017, + "grad_norm": 0.8817105889320374, + "learning_rate": 3.611860948101424e-06, + "loss": 0.6034, + "step": 5805 + }, + { + "epoch": 2.143243908442038, + "grad_norm": 0.8906303644180298, + "learning_rate": 3.6114253017276424e-06, + "loss": 0.602, + "step": 5806 + }, + { + "epoch": 2.143613093773074, + "grad_norm": 0.9070355892181396, + "learning_rate": 3.6109896132867627e-06, + "loss": 0.6518, + "step": 5807 + }, + { + "epoch": 2.1439822791041103, + "grad_norm": 0.8947898149490356, + "learning_rate": 3.610553882795278e-06, + "loss": 0.6295, + "step": 5808 + }, + { + "epoch": 2.1443514644351462, + "grad_norm": 0.8820807337760925, + "learning_rate": 3.610118110269679e-06, + "loss": 0.6028, + "step": 5809 + }, + { + "epoch": 2.1447206497661826, + "grad_norm": 0.8962070941925049, + "learning_rate": 3.6096822957264597e-06, + "loss": 0.559, + "step": 5810 + }, + { + "epoch": 2.145089835097219, + "grad_norm": 0.8476348519325256, + "learning_rate": 3.609246439182116e-06, + "loss": 0.6153, + "step": 5811 + }, + { + "epoch": 2.145459020428255, + "grad_norm": 0.8656748533248901, + "learning_rate": 3.6088105406531443e-06, + "loss": 0.5769, + "step": 5812 + }, + { + "epoch": 2.1458282057592912, + "grad_norm": 0.9053694009780884, + "learning_rate": 3.6083746001560436e-06, + "loss": 0.6223, + "step": 5813 + }, + { + "epoch": 2.146197391090327, + "grad_norm": 0.8789047002792358, + "learning_rate": 3.607938617707314e-06, + "loss": 0.6132, + "step": 5814 + }, + { + "epoch": 2.1465665764213635, + "grad_norm": 0.9181821942329407, + "learning_rate": 3.6075025933234575e-06, + "loss": 0.6385, + "step": 5815 + }, + { + "epoch": 2.1469357617524, + "grad_norm": 0.891006350517273, + "learning_rate": 3.607066527020978e-06, + "loss": 0.652, + "step": 5816 + }, + { + "epoch": 2.1473049470834358, + "grad_norm": 0.8841145634651184, + "learning_rate": 3.606630418816379e-06, + "loss": 0.6193, + "step": 5817 + }, + { + "epoch": 2.147674132414472, + "grad_norm": 0.8691807985305786, + "learning_rate": 3.606194268726169e-06, + "loss": 0.6027, + "step": 5818 + }, + { + "epoch": 2.148043317745508, + "grad_norm": 0.9074222445487976, + "learning_rate": 3.605758076766853e-06, + "loss": 0.6341, + "step": 5819 + }, + { + "epoch": 2.1484125030765444, + "grad_norm": 0.8699797987937927, + "learning_rate": 3.6053218429549452e-06, + "loss": 0.58, + "step": 5820 + }, + { + "epoch": 2.1487816884075808, + "grad_norm": 0.8912034034729004, + "learning_rate": 3.6048855673069527e-06, + "loss": 0.6166, + "step": 5821 + }, + { + "epoch": 2.1491508737386167, + "grad_norm": 0.861272931098938, + "learning_rate": 3.6044492498393914e-06, + "loss": 0.5858, + "step": 5822 + }, + { + "epoch": 2.149520059069653, + "grad_norm": 0.8903756141662598, + "learning_rate": 3.604012890568774e-06, + "loss": 0.6479, + "step": 5823 + }, + { + "epoch": 2.149889244400689, + "grad_norm": 0.8830152153968811, + "learning_rate": 3.603576489511617e-06, + "loss": 0.605, + "step": 5824 + }, + { + "epoch": 2.1502584297317253, + "grad_norm": 0.8651109933853149, + "learning_rate": 3.603140046684438e-06, + "loss": 0.6177, + "step": 5825 + }, + { + "epoch": 2.1506276150627617, + "grad_norm": 0.8977739810943604, + "learning_rate": 3.602703562103757e-06, + "loss": 0.6375, + "step": 5826 + }, + { + "epoch": 2.1509968003937976, + "grad_norm": 0.8896883726119995, + "learning_rate": 3.6022670357860938e-06, + "loss": 0.66, + "step": 5827 + }, + { + "epoch": 2.151365985724834, + "grad_norm": 0.9040690064430237, + "learning_rate": 3.601830467747972e-06, + "loss": 0.6607, + "step": 5828 + }, + { + "epoch": 2.15173517105587, + "grad_norm": 0.8762277364730835, + "learning_rate": 3.601393858005913e-06, + "loss": 0.6367, + "step": 5829 + }, + { + "epoch": 2.1521043563869062, + "grad_norm": 0.8819403648376465, + "learning_rate": 3.600957206576445e-06, + "loss": 0.5552, + "step": 5830 + }, + { + "epoch": 2.1524735417179426, + "grad_norm": 0.8948053121566772, + "learning_rate": 3.6005205134760946e-06, + "loss": 0.6468, + "step": 5831 + }, + { + "epoch": 2.1528427270489785, + "grad_norm": 0.8769354820251465, + "learning_rate": 3.6000837787213903e-06, + "loss": 0.6243, + "step": 5832 + }, + { + "epoch": 2.153211912380015, + "grad_norm": 0.9164648056030273, + "learning_rate": 3.599647002328861e-06, + "loss": 0.6018, + "step": 5833 + }, + { + "epoch": 2.1535810977110508, + "grad_norm": 0.8499975204467773, + "learning_rate": 3.5992101843150406e-06, + "loss": 0.6291, + "step": 5834 + }, + { + "epoch": 2.153950283042087, + "grad_norm": 0.8859479427337646, + "learning_rate": 3.5987733246964614e-06, + "loss": 0.6066, + "step": 5835 + }, + { + "epoch": 2.1543194683731235, + "grad_norm": 0.854315996170044, + "learning_rate": 3.5983364234896587e-06, + "loss": 0.6036, + "step": 5836 + }, + { + "epoch": 2.1546886537041594, + "grad_norm": 0.8854399919509888, + "learning_rate": 3.597899480711169e-06, + "loss": 0.6369, + "step": 5837 + }, + { + "epoch": 2.1550578390351958, + "grad_norm": 0.8746028542518616, + "learning_rate": 3.597462496377531e-06, + "loss": 0.6151, + "step": 5838 + }, + { + "epoch": 2.1554270243662317, + "grad_norm": 0.8895490169525146, + "learning_rate": 3.5970254705052826e-06, + "loss": 0.626, + "step": 5839 + }, + { + "epoch": 2.155796209697268, + "grad_norm": 0.9036489129066467, + "learning_rate": 3.5965884031109672e-06, + "loss": 0.6221, + "step": 5840 + }, + { + "epoch": 2.1561653950283044, + "grad_norm": 0.8995091319084167, + "learning_rate": 3.5961512942111265e-06, + "loss": 0.6325, + "step": 5841 + }, + { + "epoch": 2.1565345803593403, + "grad_norm": 0.8575221300125122, + "learning_rate": 3.595714143822306e-06, + "loss": 0.6016, + "step": 5842 + }, + { + "epoch": 2.1569037656903767, + "grad_norm": 0.8883785605430603, + "learning_rate": 3.5952769519610496e-06, + "loss": 0.6004, + "step": 5843 + }, + { + "epoch": 2.1572729510214126, + "grad_norm": 0.8528056144714355, + "learning_rate": 3.594839718643908e-06, + "loss": 0.6274, + "step": 5844 + }, + { + "epoch": 2.157642136352449, + "grad_norm": 0.8883569836616516, + "learning_rate": 3.594402443887427e-06, + "loss": 0.647, + "step": 5845 + }, + { + "epoch": 2.1580113216834853, + "grad_norm": 0.881409764289856, + "learning_rate": 3.59396512770816e-06, + "loss": 0.6029, + "step": 5846 + }, + { + "epoch": 2.158380507014521, + "grad_norm": 0.8778812885284424, + "learning_rate": 3.5935277701226577e-06, + "loss": 0.6002, + "step": 5847 + }, + { + "epoch": 2.1587496923455576, + "grad_norm": 0.8820952773094177, + "learning_rate": 3.5930903711474742e-06, + "loss": 0.6234, + "step": 5848 + }, + { + "epoch": 2.1591188776765935, + "grad_norm": 0.8924210071563721, + "learning_rate": 3.5926529307991655e-06, + "loss": 0.607, + "step": 5849 + }, + { + "epoch": 2.15948806300763, + "grad_norm": 0.8851708769798279, + "learning_rate": 3.5922154490942884e-06, + "loss": 0.6122, + "step": 5850 + }, + { + "epoch": 2.159857248338666, + "grad_norm": 0.9235275983810425, + "learning_rate": 3.5917779260494005e-06, + "loss": 0.6659, + "step": 5851 + }, + { + "epoch": 2.160226433669702, + "grad_norm": 0.875877857208252, + "learning_rate": 3.5913403616810628e-06, + "loss": 0.6192, + "step": 5852 + }, + { + "epoch": 2.1605956190007385, + "grad_norm": 0.8925716280937195, + "learning_rate": 3.5909027560058374e-06, + "loss": 0.5851, + "step": 5853 + }, + { + "epoch": 2.1609648043317744, + "grad_norm": 0.8848718404769897, + "learning_rate": 3.590465109040287e-06, + "loss": 0.6038, + "step": 5854 + }, + { + "epoch": 2.1613339896628108, + "grad_norm": 0.9323797821998596, + "learning_rate": 3.5900274208009763e-06, + "loss": 0.6556, + "step": 5855 + }, + { + "epoch": 2.161703174993847, + "grad_norm": 0.8813681602478027, + "learning_rate": 3.5895896913044714e-06, + "loss": 0.6474, + "step": 5856 + }, + { + "epoch": 2.162072360324883, + "grad_norm": 0.866142988204956, + "learning_rate": 3.5891519205673405e-06, + "loss": 0.6027, + "step": 5857 + }, + { + "epoch": 2.1624415456559194, + "grad_norm": 0.878430187702179, + "learning_rate": 3.5887141086061533e-06, + "loss": 0.6368, + "step": 5858 + }, + { + "epoch": 2.1628107309869553, + "grad_norm": 0.8546287417411804, + "learning_rate": 3.5882762554374806e-06, + "loss": 0.5709, + "step": 5859 + }, + { + "epoch": 2.1631799163179917, + "grad_norm": 0.9033393263816833, + "learning_rate": 3.587838361077896e-06, + "loss": 0.6079, + "step": 5860 + }, + { + "epoch": 2.1635491016490276, + "grad_norm": 0.8686055541038513, + "learning_rate": 3.587400425543972e-06, + "loss": 0.6113, + "step": 5861 + }, + { + "epoch": 2.163918286980064, + "grad_norm": 0.8797454833984375, + "learning_rate": 3.586962448852285e-06, + "loss": 0.6271, + "step": 5862 + }, + { + "epoch": 2.1642874723111003, + "grad_norm": 0.8955085277557373, + "learning_rate": 3.586524431019412e-06, + "loss": 0.6216, + "step": 5863 + }, + { + "epoch": 2.164656657642136, + "grad_norm": 0.8696057200431824, + "learning_rate": 3.5860863720619333e-06, + "loss": 0.6181, + "step": 5864 + }, + { + "epoch": 2.1650258429731726, + "grad_norm": 0.8754781484603882, + "learning_rate": 3.585648271996427e-06, + "loss": 0.6268, + "step": 5865 + }, + { + "epoch": 2.165395028304209, + "grad_norm": 0.8671129941940308, + "learning_rate": 3.585210130839477e-06, + "loss": 0.5973, + "step": 5866 + }, + { + "epoch": 2.165764213635245, + "grad_norm": 0.9030606150627136, + "learning_rate": 3.5847719486076655e-06, + "loss": 0.6374, + "step": 5867 + }, + { + "epoch": 2.166133398966281, + "grad_norm": 0.8802056908607483, + "learning_rate": 3.5843337253175793e-06, + "loss": 0.6329, + "step": 5868 + }, + { + "epoch": 2.166502584297317, + "grad_norm": 0.8997842669487, + "learning_rate": 3.5838954609858025e-06, + "loss": 0.6322, + "step": 5869 + }, + { + "epoch": 2.1668717696283535, + "grad_norm": 0.8999974727630615, + "learning_rate": 3.5834571556289255e-06, + "loss": 0.6416, + "step": 5870 + }, + { + "epoch": 2.1672409549593894, + "grad_norm": 0.8777790069580078, + "learning_rate": 3.583018809263536e-06, + "loss": 0.5663, + "step": 5871 + }, + { + "epoch": 2.1676101402904258, + "grad_norm": 0.8690145015716553, + "learning_rate": 3.5825804219062272e-06, + "loss": 0.6328, + "step": 5872 + }, + { + "epoch": 2.167979325621462, + "grad_norm": 0.8719207048416138, + "learning_rate": 3.5821419935735905e-06, + "loss": 0.6242, + "step": 5873 + }, + { + "epoch": 2.168348510952498, + "grad_norm": 0.9131308197975159, + "learning_rate": 3.5817035242822217e-06, + "loss": 0.6194, + "step": 5874 + }, + { + "epoch": 2.1687176962835344, + "grad_norm": 0.8941230773925781, + "learning_rate": 3.581265014048716e-06, + "loss": 0.626, + "step": 5875 + }, + { + "epoch": 2.1690868816145707, + "grad_norm": 0.9238820672035217, + "learning_rate": 3.5808264628896698e-06, + "loss": 0.6451, + "step": 5876 + }, + { + "epoch": 2.1694560669456067, + "grad_norm": 0.9075450897216797, + "learning_rate": 3.5803878708216833e-06, + "loss": 0.6016, + "step": 5877 + }, + { + "epoch": 2.169825252276643, + "grad_norm": 0.8912283182144165, + "learning_rate": 3.5799492378613576e-06, + "loss": 0.6042, + "step": 5878 + }, + { + "epoch": 2.170194437607679, + "grad_norm": 0.852512538433075, + "learning_rate": 3.5795105640252937e-06, + "loss": 0.6347, + "step": 5879 + }, + { + "epoch": 2.1705636229387153, + "grad_norm": 0.8797526955604553, + "learning_rate": 3.579071849330096e-06, + "loss": 0.5946, + "step": 5880 + }, + { + "epoch": 2.170932808269751, + "grad_norm": 0.888956606388092, + "learning_rate": 3.578633093792368e-06, + "loss": 0.6195, + "step": 5881 + }, + { + "epoch": 2.1713019936007876, + "grad_norm": 0.8728255033493042, + "learning_rate": 3.5781942974287192e-06, + "loss": 0.6083, + "step": 5882 + }, + { + "epoch": 2.171671178931824, + "grad_norm": 0.8616536259651184, + "learning_rate": 3.5777554602557557e-06, + "loss": 0.6062, + "step": 5883 + }, + { + "epoch": 2.17204036426286, + "grad_norm": 0.864197850227356, + "learning_rate": 3.577316582290089e-06, + "loss": 0.6071, + "step": 5884 + }, + { + "epoch": 2.172409549593896, + "grad_norm": 0.8982044458389282, + "learning_rate": 3.5768776635483293e-06, + "loss": 0.6167, + "step": 5885 + }, + { + "epoch": 2.172778734924932, + "grad_norm": 0.894494354724884, + "learning_rate": 3.5764387040470895e-06, + "loss": 0.6241, + "step": 5886 + }, + { + "epoch": 2.1731479202559685, + "grad_norm": 0.8926079273223877, + "learning_rate": 3.5759997038029847e-06, + "loss": 0.6447, + "step": 5887 + }, + { + "epoch": 2.173517105587005, + "grad_norm": 0.8943877220153809, + "learning_rate": 3.575560662832631e-06, + "loss": 0.5919, + "step": 5888 + }, + { + "epoch": 2.1738862909180408, + "grad_norm": 0.9142517447471619, + "learning_rate": 3.575121581152645e-06, + "loss": 0.6185, + "step": 5889 + }, + { + "epoch": 2.174255476249077, + "grad_norm": 0.9140036702156067, + "learning_rate": 3.5746824587796468e-06, + "loss": 0.6495, + "step": 5890 + }, + { + "epoch": 2.174624661580113, + "grad_norm": 0.8992201089859009, + "learning_rate": 3.574243295730257e-06, + "loss": 0.6187, + "step": 5891 + }, + { + "epoch": 2.1749938469111494, + "grad_norm": 0.8663465976715088, + "learning_rate": 3.5738040920210983e-06, + "loss": 0.6011, + "step": 5892 + }, + { + "epoch": 2.1753630322421857, + "grad_norm": 0.8849654793739319, + "learning_rate": 3.5733648476687923e-06, + "loss": 0.6151, + "step": 5893 + }, + { + "epoch": 2.1757322175732217, + "grad_norm": 0.932027280330658, + "learning_rate": 3.5729255626899663e-06, + "loss": 0.6481, + "step": 5894 + }, + { + "epoch": 2.176101402904258, + "grad_norm": 0.8646990060806274, + "learning_rate": 3.5724862371012446e-06, + "loss": 0.6313, + "step": 5895 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.892325758934021, + "learning_rate": 3.5720468709192597e-06, + "loss": 0.6372, + "step": 5896 + }, + { + "epoch": 2.1768397735663303, + "grad_norm": 0.8842151165008545, + "learning_rate": 3.571607464160638e-06, + "loss": 0.6089, + "step": 5897 + }, + { + "epoch": 2.1772089588973667, + "grad_norm": 0.8594691753387451, + "learning_rate": 3.571168016842012e-06, + "loss": 0.631, + "step": 5898 + }, + { + "epoch": 2.1775781442284026, + "grad_norm": 0.8790596127510071, + "learning_rate": 3.5707285289800147e-06, + "loss": 0.6198, + "step": 5899 + }, + { + "epoch": 2.177947329559439, + "grad_norm": 0.8807023167610168, + "learning_rate": 3.5702890005912805e-06, + "loss": 0.6208, + "step": 5900 + }, + { + "epoch": 2.178316514890475, + "grad_norm": 0.9123364090919495, + "learning_rate": 3.569849431692445e-06, + "loss": 0.645, + "step": 5901 + }, + { + "epoch": 2.178685700221511, + "grad_norm": 0.9749924540519714, + "learning_rate": 3.5694098223001473e-06, + "loss": 0.5873, + "step": 5902 + }, + { + "epoch": 2.1790548855525476, + "grad_norm": 0.9109043478965759, + "learning_rate": 3.5689701724310244e-06, + "loss": 0.6185, + "step": 5903 + }, + { + "epoch": 2.1794240708835835, + "grad_norm": 0.8786762952804565, + "learning_rate": 3.5685304821017185e-06, + "loss": 0.6075, + "step": 5904 + }, + { + "epoch": 2.17979325621462, + "grad_norm": 0.8673607707023621, + "learning_rate": 3.5680907513288696e-06, + "loss": 0.6261, + "step": 5905 + }, + { + "epoch": 2.1801624415456557, + "grad_norm": 0.8824069499969482, + "learning_rate": 3.5676509801291245e-06, + "loss": 0.5816, + "step": 5906 + }, + { + "epoch": 2.180531626876692, + "grad_norm": 0.8478742837905884, + "learning_rate": 3.567211168519126e-06, + "loss": 0.5955, + "step": 5907 + }, + { + "epoch": 2.1809008122077285, + "grad_norm": 0.8771120309829712, + "learning_rate": 3.566771316515522e-06, + "loss": 0.6396, + "step": 5908 + }, + { + "epoch": 2.1812699975387644, + "grad_norm": 0.8878215551376343, + "learning_rate": 3.5663314241349596e-06, + "loss": 0.6529, + "step": 5909 + }, + { + "epoch": 2.1816391828698007, + "grad_norm": 0.876442015171051, + "learning_rate": 3.56589149139409e-06, + "loss": 0.627, + "step": 5910 + }, + { + "epoch": 2.1820083682008367, + "grad_norm": 0.8779415488243103, + "learning_rate": 3.565451518309564e-06, + "loss": 0.5873, + "step": 5911 + }, + { + "epoch": 2.182377553531873, + "grad_norm": 0.8630866408348083, + "learning_rate": 3.5650115048980336e-06, + "loss": 0.6216, + "step": 5912 + }, + { + "epoch": 2.1827467388629094, + "grad_norm": 0.8901382684707642, + "learning_rate": 3.564571451176154e-06, + "loss": 0.5978, + "step": 5913 + }, + { + "epoch": 2.1831159241939453, + "grad_norm": 0.8931598663330078, + "learning_rate": 3.564131357160582e-06, + "loss": 0.5777, + "step": 5914 + }, + { + "epoch": 2.1834851095249816, + "grad_norm": 0.9108725786209106, + "learning_rate": 3.5636912228679724e-06, + "loss": 0.6347, + "step": 5915 + }, + { + "epoch": 2.1838542948560176, + "grad_norm": 0.9001242518424988, + "learning_rate": 3.5632510483149875e-06, + "loss": 0.6063, + "step": 5916 + }, + { + "epoch": 2.184223480187054, + "grad_norm": 0.8889416456222534, + "learning_rate": 3.5628108335182844e-06, + "loss": 0.6448, + "step": 5917 + }, + { + "epoch": 2.1845926655180903, + "grad_norm": 0.8775550723075867, + "learning_rate": 3.5623705784945272e-06, + "loss": 0.6069, + "step": 5918 + }, + { + "epoch": 2.184961850849126, + "grad_norm": 0.8771174550056458, + "learning_rate": 3.5619302832603788e-06, + "loss": 0.6182, + "step": 5919 + }, + { + "epoch": 2.1853310361801626, + "grad_norm": 0.895348846912384, + "learning_rate": 3.561489947832505e-06, + "loss": 0.6288, + "step": 5920 + }, + { + "epoch": 2.1857002215111985, + "grad_norm": 0.9067697525024414, + "learning_rate": 3.5610495722275708e-06, + "loss": 0.6274, + "step": 5921 + }, + { + "epoch": 2.186069406842235, + "grad_norm": 0.8828921914100647, + "learning_rate": 3.560609156462246e-06, + "loss": 0.6297, + "step": 5922 + }, + { + "epoch": 2.186438592173271, + "grad_norm": 0.9005772471427917, + "learning_rate": 3.5601687005531987e-06, + "loss": 0.6375, + "step": 5923 + }, + { + "epoch": 2.186807777504307, + "grad_norm": 0.8798536062240601, + "learning_rate": 3.5597282045171007e-06, + "loss": 0.5962, + "step": 5924 + }, + { + "epoch": 2.1871769628353435, + "grad_norm": 0.8572691679000854, + "learning_rate": 3.5592876683706245e-06, + "loss": 0.6221, + "step": 5925 + }, + { + "epoch": 2.1875461481663794, + "grad_norm": 0.9058374762535095, + "learning_rate": 3.558847092130445e-06, + "loss": 0.6259, + "step": 5926 + }, + { + "epoch": 2.1879153334974157, + "grad_norm": 0.8837125897407532, + "learning_rate": 3.558406475813237e-06, + "loss": 0.6318, + "step": 5927 + }, + { + "epoch": 2.1882845188284517, + "grad_norm": 0.8858901262283325, + "learning_rate": 3.5579658194356776e-06, + "loss": 0.6621, + "step": 5928 + }, + { + "epoch": 2.188653704159488, + "grad_norm": 0.8861137628555298, + "learning_rate": 3.5575251230144457e-06, + "loss": 0.5938, + "step": 5929 + }, + { + "epoch": 2.1890228894905244, + "grad_norm": 0.873225212097168, + "learning_rate": 3.5570843865662226e-06, + "loss": 0.5705, + "step": 5930 + }, + { + "epoch": 2.1893920748215603, + "grad_norm": 0.8638142943382263, + "learning_rate": 3.5566436101076884e-06, + "loss": 0.6118, + "step": 5931 + }, + { + "epoch": 2.1897612601525966, + "grad_norm": 0.9096303582191467, + "learning_rate": 3.5562027936555275e-06, + "loss": 0.629, + "step": 5932 + }, + { + "epoch": 2.190130445483633, + "grad_norm": 0.9279538989067078, + "learning_rate": 3.555761937226423e-06, + "loss": 0.6318, + "step": 5933 + }, + { + "epoch": 2.190499630814669, + "grad_norm": 0.8614236116409302, + "learning_rate": 3.5553210408370635e-06, + "loss": 0.6044, + "step": 5934 + }, + { + "epoch": 2.1908688161457053, + "grad_norm": 0.8802698850631714, + "learning_rate": 3.5548801045041352e-06, + "loss": 0.6442, + "step": 5935 + }, + { + "epoch": 2.191238001476741, + "grad_norm": 0.898713231086731, + "learning_rate": 3.5544391282443286e-06, + "loss": 0.6253, + "step": 5936 + }, + { + "epoch": 2.1916071868077776, + "grad_norm": 0.8857128024101257, + "learning_rate": 3.553998112074333e-06, + "loss": 0.639, + "step": 5937 + }, + { + "epoch": 2.1919763721388135, + "grad_norm": 0.881545901298523, + "learning_rate": 3.553557056010842e-06, + "loss": 0.6223, + "step": 5938 + }, + { + "epoch": 2.19234555746985, + "grad_norm": 0.9134512543678284, + "learning_rate": 3.5531159600705483e-06, + "loss": 0.6115, + "step": 5939 + }, + { + "epoch": 2.192714742800886, + "grad_norm": 0.9085601568222046, + "learning_rate": 3.5526748242701485e-06, + "loss": 0.6217, + "step": 5940 + }, + { + "epoch": 2.193083928131922, + "grad_norm": 0.8944205045700073, + "learning_rate": 3.5522336486263387e-06, + "loss": 0.6239, + "step": 5941 + }, + { + "epoch": 2.1934531134629585, + "grad_norm": 0.9071425199508667, + "learning_rate": 3.5517924331558172e-06, + "loss": 0.6, + "step": 5942 + }, + { + "epoch": 2.193822298793995, + "grad_norm": 0.9298303127288818, + "learning_rate": 3.551351177875284e-06, + "loss": 0.6669, + "step": 5943 + }, + { + "epoch": 2.1941914841250307, + "grad_norm": 0.8743665814399719, + "learning_rate": 3.550909882801441e-06, + "loss": 0.5666, + "step": 5944 + }, + { + "epoch": 2.194560669456067, + "grad_norm": 0.8759313225746155, + "learning_rate": 3.55046854795099e-06, + "loss": 0.6108, + "step": 5945 + }, + { + "epoch": 2.194929854787103, + "grad_norm": 0.9174638390541077, + "learning_rate": 3.5500271733406365e-06, + "loss": 0.6011, + "step": 5946 + }, + { + "epoch": 2.1952990401181394, + "grad_norm": 0.8793237209320068, + "learning_rate": 3.5495857589870853e-06, + "loss": 0.6011, + "step": 5947 + }, + { + "epoch": 2.1956682254491753, + "grad_norm": 0.8749545216560364, + "learning_rate": 3.549144304907045e-06, + "loss": 0.6348, + "step": 5948 + }, + { + "epoch": 2.1960374107802116, + "grad_norm": 0.8775364756584167, + "learning_rate": 3.5487028111172236e-06, + "loss": 0.6355, + "step": 5949 + }, + { + "epoch": 2.196406596111248, + "grad_norm": 0.898937463760376, + "learning_rate": 3.5482612776343315e-06, + "loss": 0.5926, + "step": 5950 + }, + { + "epoch": 2.196775781442284, + "grad_norm": 0.8911599516868591, + "learning_rate": 3.5478197044750818e-06, + "loss": 0.617, + "step": 5951 + }, + { + "epoch": 2.1971449667733203, + "grad_norm": 0.8917903900146484, + "learning_rate": 3.547378091656186e-06, + "loss": 0.6407, + "step": 5952 + }, + { + "epoch": 2.197514152104356, + "grad_norm": 0.8685060739517212, + "learning_rate": 3.54693643919436e-06, + "loss": 0.6475, + "step": 5953 + }, + { + "epoch": 2.1978833374353925, + "grad_norm": 0.8528156876564026, + "learning_rate": 3.5464947471063212e-06, + "loss": 0.5924, + "step": 5954 + }, + { + "epoch": 2.198252522766429, + "grad_norm": 0.8717610836029053, + "learning_rate": 3.546053015408785e-06, + "loss": 0.6271, + "step": 5955 + }, + { + "epoch": 2.198621708097465, + "grad_norm": 0.870945394039154, + "learning_rate": 3.545611244118474e-06, + "loss": 0.665, + "step": 5956 + }, + { + "epoch": 2.198990893428501, + "grad_norm": 0.8990254402160645, + "learning_rate": 3.5451694332521057e-06, + "loss": 0.6018, + "step": 5957 + }, + { + "epoch": 2.199360078759537, + "grad_norm": 0.9900867342948914, + "learning_rate": 3.5447275828264056e-06, + "loss": 0.6004, + "step": 5958 + }, + { + "epoch": 2.1997292640905735, + "grad_norm": 0.8898429274559021, + "learning_rate": 3.5442856928580955e-06, + "loss": 0.6296, + "step": 5959 + }, + { + "epoch": 2.20009844942161, + "grad_norm": 0.8795872926712036, + "learning_rate": 3.543843763363902e-06, + "loss": 0.6105, + "step": 5960 + }, + { + "epoch": 2.2004676347526457, + "grad_norm": 0.8719852566719055, + "learning_rate": 3.5434017943605508e-06, + "loss": 0.6053, + "step": 5961 + }, + { + "epoch": 2.200836820083682, + "grad_norm": 0.8981032371520996, + "learning_rate": 3.5429597858647714e-06, + "loss": 0.661, + "step": 5962 + }, + { + "epoch": 2.201206005414718, + "grad_norm": 0.8964576721191406, + "learning_rate": 3.542517737893293e-06, + "loss": 0.6288, + "step": 5963 + }, + { + "epoch": 2.2015751907457544, + "grad_norm": 0.8625016808509827, + "learning_rate": 3.542075650462848e-06, + "loss": 0.5969, + "step": 5964 + }, + { + "epoch": 2.2019443760767907, + "grad_norm": 0.8780565857887268, + "learning_rate": 3.541633523590168e-06, + "loss": 0.6087, + "step": 5965 + }, + { + "epoch": 2.2023135614078266, + "grad_norm": 0.9387151598930359, + "learning_rate": 3.5411913572919875e-06, + "loss": 0.6588, + "step": 5966 + }, + { + "epoch": 2.202682746738863, + "grad_norm": 0.887510359287262, + "learning_rate": 3.540749151585043e-06, + "loss": 0.6115, + "step": 5967 + }, + { + "epoch": 2.203051932069899, + "grad_norm": 0.910335123538971, + "learning_rate": 3.5403069064860716e-06, + "loss": 0.6337, + "step": 5968 + }, + { + "epoch": 2.2034211174009353, + "grad_norm": 0.9075178503990173, + "learning_rate": 3.5398646220118127e-06, + "loss": 0.6531, + "step": 5969 + }, + { + "epoch": 2.2037903027319716, + "grad_norm": 0.8599549531936646, + "learning_rate": 3.5394222981790053e-06, + "loss": 0.6011, + "step": 5970 + }, + { + "epoch": 2.2041594880630075, + "grad_norm": 0.8891392350196838, + "learning_rate": 3.538979935004392e-06, + "loss": 0.615, + "step": 5971 + }, + { + "epoch": 2.204528673394044, + "grad_norm": 0.893242597579956, + "learning_rate": 3.5385375325047167e-06, + "loss": 0.6217, + "step": 5972 + }, + { + "epoch": 2.20489785872508, + "grad_norm": 0.8650193214416504, + "learning_rate": 3.5380950906967225e-06, + "loss": 0.6131, + "step": 5973 + }, + { + "epoch": 2.205267044056116, + "grad_norm": 0.8893565535545349, + "learning_rate": 3.537652609597158e-06, + "loss": 0.6247, + "step": 5974 + }, + { + "epoch": 2.2056362293871525, + "grad_norm": 0.8953511118888855, + "learning_rate": 3.537210089222769e-06, + "loss": 0.6402, + "step": 5975 + }, + { + "epoch": 2.2060054147181885, + "grad_norm": 0.889409601688385, + "learning_rate": 3.5367675295903053e-06, + "loss": 0.6238, + "step": 5976 + }, + { + "epoch": 2.206374600049225, + "grad_norm": 0.9118937849998474, + "learning_rate": 3.5363249307165182e-06, + "loss": 0.6154, + "step": 5977 + }, + { + "epoch": 2.2067437853802607, + "grad_norm": 0.8851687908172607, + "learning_rate": 3.535882292618159e-06, + "loss": 0.6116, + "step": 5978 + }, + { + "epoch": 2.207112970711297, + "grad_norm": 0.8776278495788574, + "learning_rate": 3.5354396153119824e-06, + "loss": 0.6557, + "step": 5979 + }, + { + "epoch": 2.2074821560423334, + "grad_norm": 0.9140428304672241, + "learning_rate": 3.534996898814743e-06, + "loss": 0.6555, + "step": 5980 + }, + { + "epoch": 2.2078513413733694, + "grad_norm": 0.8949410319328308, + "learning_rate": 3.534554143143198e-06, + "loss": 0.6098, + "step": 5981 + }, + { + "epoch": 2.2082205267044057, + "grad_norm": 0.8830333948135376, + "learning_rate": 3.5341113483141055e-06, + "loss": 0.5619, + "step": 5982 + }, + { + "epoch": 2.2085897120354416, + "grad_norm": 0.8690739870071411, + "learning_rate": 3.533668514344224e-06, + "loss": 0.6363, + "step": 5983 + }, + { + "epoch": 2.208958897366478, + "grad_norm": 0.8787680864334106, + "learning_rate": 3.5332256412503157e-06, + "loss": 0.6372, + "step": 5984 + }, + { + "epoch": 2.2093280826975144, + "grad_norm": 0.8926694989204407, + "learning_rate": 3.532782729049143e-06, + "loss": 0.6294, + "step": 5985 + }, + { + "epoch": 2.2096972680285503, + "grad_norm": 0.8963958024978638, + "learning_rate": 3.5323397777574696e-06, + "loss": 0.6323, + "step": 5986 + }, + { + "epoch": 2.2100664533595866, + "grad_norm": 0.9259573817253113, + "learning_rate": 3.531896787392062e-06, + "loss": 0.6253, + "step": 5987 + }, + { + "epoch": 2.2104356386906225, + "grad_norm": 0.8775355815887451, + "learning_rate": 3.5314537579696866e-06, + "loss": 0.624, + "step": 5988 + }, + { + "epoch": 2.210804824021659, + "grad_norm": 0.8662400245666504, + "learning_rate": 3.5310106895071117e-06, + "loss": 0.6222, + "step": 5989 + }, + { + "epoch": 2.2111740093526953, + "grad_norm": 0.8872251510620117, + "learning_rate": 3.5305675820211077e-06, + "loss": 0.6239, + "step": 5990 + }, + { + "epoch": 2.211543194683731, + "grad_norm": 0.8662833571434021, + "learning_rate": 3.5301244355284458e-06, + "loss": 0.6038, + "step": 5991 + }, + { + "epoch": 2.2119123800147675, + "grad_norm": 0.9062726497650146, + "learning_rate": 3.5296812500459e-06, + "loss": 0.6116, + "step": 5992 + }, + { + "epoch": 2.2122815653458034, + "grad_norm": 0.911340057849884, + "learning_rate": 3.5292380255902424e-06, + "loss": 0.6139, + "step": 5993 + }, + { + "epoch": 2.21265075067684, + "grad_norm": 0.9384115934371948, + "learning_rate": 3.5287947621782516e-06, + "loss": 0.5937, + "step": 5994 + }, + { + "epoch": 2.2130199360078757, + "grad_norm": 0.8443077206611633, + "learning_rate": 3.528351459826703e-06, + "loss": 0.6094, + "step": 5995 + }, + { + "epoch": 2.213389121338912, + "grad_norm": 0.9219194650650024, + "learning_rate": 3.5279081185523766e-06, + "loss": 0.6309, + "step": 5996 + }, + { + "epoch": 2.2137583066699484, + "grad_norm": 0.862703800201416, + "learning_rate": 3.5274647383720523e-06, + "loss": 0.612, + "step": 5997 + }, + { + "epoch": 2.2141274920009844, + "grad_norm": 0.8756140470504761, + "learning_rate": 3.527021319302512e-06, + "loss": 0.6082, + "step": 5998 + }, + { + "epoch": 2.2144966773320207, + "grad_norm": 0.9045357704162598, + "learning_rate": 3.526577861360538e-06, + "loss": 0.6304, + "step": 5999 + }, + { + "epoch": 2.214865862663057, + "grad_norm": 0.8708902597427368, + "learning_rate": 3.5261343645629175e-06, + "loss": 0.6105, + "step": 6000 + }, + { + "epoch": 2.215235047994093, + "grad_norm": 0.9217180013656616, + "learning_rate": 3.525690828926434e-06, + "loss": 0.6411, + "step": 6001 + }, + { + "epoch": 2.2156042333251293, + "grad_norm": 0.874293863773346, + "learning_rate": 3.525247254467877e-06, + "loss": 0.5972, + "step": 6002 + }, + { + "epoch": 2.2159734186561653, + "grad_norm": 0.8875423669815063, + "learning_rate": 3.5248036412040342e-06, + "loss": 0.5895, + "step": 6003 + }, + { + "epoch": 2.2163426039872016, + "grad_norm": 0.903434693813324, + "learning_rate": 3.5243599891516978e-06, + "loss": 0.6145, + "step": 6004 + }, + { + "epoch": 2.2167117893182375, + "grad_norm": 0.8950120806694031, + "learning_rate": 3.5239162983276585e-06, + "loss": 0.6046, + "step": 6005 + }, + { + "epoch": 2.217080974649274, + "grad_norm": 0.8657493591308594, + "learning_rate": 3.523472568748711e-06, + "loss": 0.6093, + "step": 6006 + }, + { + "epoch": 2.2174501599803103, + "grad_norm": 0.8619616031646729, + "learning_rate": 3.523028800431649e-06, + "loss": 0.6086, + "step": 6007 + }, + { + "epoch": 2.217819345311346, + "grad_norm": 0.8962694406509399, + "learning_rate": 3.522584993393271e-06, + "loss": 0.6719, + "step": 6008 + }, + { + "epoch": 2.2181885306423825, + "grad_norm": 0.8792092204093933, + "learning_rate": 3.522141147650373e-06, + "loss": 0.5925, + "step": 6009 + }, + { + "epoch": 2.218557715973419, + "grad_norm": 0.8328500986099243, + "learning_rate": 3.521697263219755e-06, + "loss": 0.6044, + "step": 6010 + }, + { + "epoch": 2.218926901304455, + "grad_norm": 0.9367761015892029, + "learning_rate": 3.521253340118218e-06, + "loss": 0.6289, + "step": 6011 + }, + { + "epoch": 2.219296086635491, + "grad_norm": 0.8808512091636658, + "learning_rate": 3.520809378362565e-06, + "loss": 0.6609, + "step": 6012 + }, + { + "epoch": 2.219665271966527, + "grad_norm": 0.9133322238922119, + "learning_rate": 3.520365377969599e-06, + "loss": 0.6595, + "step": 6013 + }, + { + "epoch": 2.2200344572975634, + "grad_norm": 0.879281222820282, + "learning_rate": 3.519921338956125e-06, + "loss": 0.6142, + "step": 6014 + }, + { + "epoch": 2.2204036426285993, + "grad_norm": 0.8941693902015686, + "learning_rate": 3.519477261338951e-06, + "loss": 0.6487, + "step": 6015 + }, + { + "epoch": 2.2207728279596357, + "grad_norm": 0.9090436697006226, + "learning_rate": 3.5190331451348836e-06, + "loss": 0.6298, + "step": 6016 + }, + { + "epoch": 2.221142013290672, + "grad_norm": 0.8936241865158081, + "learning_rate": 3.5185889903607333e-06, + "loss": 0.6375, + "step": 6017 + }, + { + "epoch": 2.221511198621708, + "grad_norm": 0.8996967077255249, + "learning_rate": 3.5181447970333117e-06, + "loss": 0.6444, + "step": 6018 + }, + { + "epoch": 2.2218803839527443, + "grad_norm": 1.041375994682312, + "learning_rate": 3.5177005651694308e-06, + "loss": 0.6265, + "step": 6019 + }, + { + "epoch": 2.2222495692837803, + "grad_norm": 0.8946433663368225, + "learning_rate": 3.5172562947859045e-06, + "loss": 0.6413, + "step": 6020 + }, + { + "epoch": 2.2226187546148166, + "grad_norm": 0.8586397171020508, + "learning_rate": 3.5168119858995487e-06, + "loss": 0.6078, + "step": 6021 + }, + { + "epoch": 2.222987939945853, + "grad_norm": 0.8904023170471191, + "learning_rate": 3.51636763852718e-06, + "loss": 0.6483, + "step": 6022 + }, + { + "epoch": 2.223357125276889, + "grad_norm": 0.9032081365585327, + "learning_rate": 3.515923252685617e-06, + "loss": 0.6246, + "step": 6023 + }, + { + "epoch": 2.2237263106079252, + "grad_norm": 0.8987188935279846, + "learning_rate": 3.515478828391679e-06, + "loss": 0.6116, + "step": 6024 + }, + { + "epoch": 2.224095495938961, + "grad_norm": 0.8935006260871887, + "learning_rate": 3.5150343656621886e-06, + "loss": 0.6433, + "step": 6025 + }, + { + "epoch": 2.2244646812699975, + "grad_norm": 0.9283369779586792, + "learning_rate": 3.5145898645139676e-06, + "loss": 0.6173, + "step": 6026 + }, + { + "epoch": 2.224833866601034, + "grad_norm": 0.8645320534706116, + "learning_rate": 3.51414532496384e-06, + "loss": 0.5776, + "step": 6027 + }, + { + "epoch": 2.22520305193207, + "grad_norm": 0.9161961078643799, + "learning_rate": 3.513700747028632e-06, + "loss": 0.5954, + "step": 6028 + }, + { + "epoch": 2.225572237263106, + "grad_norm": 0.8955650925636292, + "learning_rate": 3.513256130725171e-06, + "loss": 0.6494, + "step": 6029 + }, + { + "epoch": 2.225941422594142, + "grad_norm": 0.9036319255828857, + "learning_rate": 3.5128114760702853e-06, + "loss": 0.6402, + "step": 6030 + }, + { + "epoch": 2.2263106079251784, + "grad_norm": 0.8702560067176819, + "learning_rate": 3.512366783080805e-06, + "loss": 0.5835, + "step": 6031 + }, + { + "epoch": 2.226679793256215, + "grad_norm": 0.9102113246917725, + "learning_rate": 3.5119220517735616e-06, + "loss": 0.6286, + "step": 6032 + }, + { + "epoch": 2.2270489785872507, + "grad_norm": 0.8843192458152771, + "learning_rate": 3.511477282165387e-06, + "loss": 0.5716, + "step": 6033 + }, + { + "epoch": 2.227418163918287, + "grad_norm": 0.897646963596344, + "learning_rate": 3.511032474273117e-06, + "loss": 0.6431, + "step": 6034 + }, + { + "epoch": 2.227787349249323, + "grad_norm": 0.9014382362365723, + "learning_rate": 3.510587628113587e-06, + "loss": 0.6028, + "step": 6035 + }, + { + "epoch": 2.2281565345803593, + "grad_norm": 0.8870620727539062, + "learning_rate": 3.510142743703634e-06, + "loss": 0.6561, + "step": 6036 + }, + { + "epoch": 2.2285257199113957, + "grad_norm": 0.8933110237121582, + "learning_rate": 3.5096978210600975e-06, + "loss": 0.6139, + "step": 6037 + }, + { + "epoch": 2.2288949052424316, + "grad_norm": 0.8616639375686646, + "learning_rate": 3.5092528601998165e-06, + "loss": 0.5918, + "step": 6038 + }, + { + "epoch": 2.229264090573468, + "grad_norm": 0.8636903166770935, + "learning_rate": 3.5088078611396336e-06, + "loss": 0.5847, + "step": 6039 + }, + { + "epoch": 2.229633275904504, + "grad_norm": 0.9131521582603455, + "learning_rate": 3.5083628238963913e-06, + "loss": 0.6675, + "step": 6040 + }, + { + "epoch": 2.2300024612355402, + "grad_norm": 0.9132584929466248, + "learning_rate": 3.507917748486934e-06, + "loss": 0.6119, + "step": 6041 + }, + { + "epoch": 2.2303716465665766, + "grad_norm": 0.8773882389068604, + "learning_rate": 3.5074726349281083e-06, + "loss": 0.6166, + "step": 6042 + }, + { + "epoch": 2.2307408318976125, + "grad_norm": 0.8955878615379333, + "learning_rate": 3.5070274832367625e-06, + "loss": 0.6043, + "step": 6043 + }, + { + "epoch": 2.231110017228649, + "grad_norm": 0.8771915435791016, + "learning_rate": 3.5065822934297435e-06, + "loss": 0.613, + "step": 6044 + }, + { + "epoch": 2.231479202559685, + "grad_norm": 0.8890005350112915, + "learning_rate": 3.5061370655239024e-06, + "loss": 0.6352, + "step": 6045 + }, + { + "epoch": 2.231848387890721, + "grad_norm": 0.8732471466064453, + "learning_rate": 3.5056917995360907e-06, + "loss": 0.5996, + "step": 6046 + }, + { + "epoch": 2.2322175732217575, + "grad_norm": 0.8901222944259644, + "learning_rate": 3.5052464954831613e-06, + "loss": 0.5912, + "step": 6047 + }, + { + "epoch": 2.2325867585527934, + "grad_norm": 0.9037148356437683, + "learning_rate": 3.5048011533819697e-06, + "loss": 0.5986, + "step": 6048 + }, + { + "epoch": 2.23295594388383, + "grad_norm": 0.8515931367874146, + "learning_rate": 3.5043557732493723e-06, + "loss": 0.6457, + "step": 6049 + }, + { + "epoch": 2.2333251292148657, + "grad_norm": 0.8494996428489685, + "learning_rate": 3.5039103551022246e-06, + "loss": 0.5708, + "step": 6050 + }, + { + "epoch": 2.233694314545902, + "grad_norm": 0.8951349258422852, + "learning_rate": 3.5034648989573877e-06, + "loss": 0.618, + "step": 6051 + }, + { + "epoch": 2.2340634998769384, + "grad_norm": 0.9084248542785645, + "learning_rate": 3.5030194048317205e-06, + "loss": 0.6318, + "step": 6052 + }, + { + "epoch": 2.2344326852079743, + "grad_norm": 0.8931682705879211, + "learning_rate": 3.5025738727420866e-06, + "loss": 0.605, + "step": 6053 + }, + { + "epoch": 2.2348018705390107, + "grad_norm": 0.8737742304801941, + "learning_rate": 3.5021283027053466e-06, + "loss": 0.6364, + "step": 6054 + }, + { + "epoch": 2.2351710558700466, + "grad_norm": 0.8877624869346619, + "learning_rate": 3.501682694738367e-06, + "loss": 0.626, + "step": 6055 + }, + { + "epoch": 2.235540241201083, + "grad_norm": 0.8823633193969727, + "learning_rate": 3.501237048858014e-06, + "loss": 0.5934, + "step": 6056 + }, + { + "epoch": 2.2359094265321193, + "grad_norm": 0.9035148620605469, + "learning_rate": 3.500791365081154e-06, + "loss": 0.6198, + "step": 6057 + }, + { + "epoch": 2.2362786118631552, + "grad_norm": 0.8867987990379333, + "learning_rate": 3.5003456434246573e-06, + "loss": 0.6334, + "step": 6058 + }, + { + "epoch": 2.2366477971941916, + "grad_norm": 0.8909644484519958, + "learning_rate": 3.4998998839053938e-06, + "loss": 0.584, + "step": 6059 + }, + { + "epoch": 2.2370169825252275, + "grad_norm": 0.8742733001708984, + "learning_rate": 3.499454086540235e-06, + "loss": 0.6522, + "step": 6060 + }, + { + "epoch": 2.237386167856264, + "grad_norm": 0.8883030414581299, + "learning_rate": 3.4990082513460543e-06, + "loss": 0.6605, + "step": 6061 + }, + { + "epoch": 2.2377553531873, + "grad_norm": 0.8958737850189209, + "learning_rate": 3.4985623783397267e-06, + "loss": 0.6178, + "step": 6062 + }, + { + "epoch": 2.238124538518336, + "grad_norm": 0.8951502442359924, + "learning_rate": 3.4981164675381286e-06, + "loss": 0.5759, + "step": 6063 + }, + { + "epoch": 2.2384937238493725, + "grad_norm": 0.8910221457481384, + "learning_rate": 3.4976705189581365e-06, + "loss": 0.6407, + "step": 6064 + }, + { + "epoch": 2.2388629091804084, + "grad_norm": 0.8670762777328491, + "learning_rate": 3.4972245326166305e-06, + "loss": 0.6071, + "step": 6065 + }, + { + "epoch": 2.239232094511445, + "grad_norm": 0.8913832902908325, + "learning_rate": 3.4967785085304905e-06, + "loss": 0.6531, + "step": 6066 + }, + { + "epoch": 2.239601279842481, + "grad_norm": 0.8784464597702026, + "learning_rate": 3.496332446716599e-06, + "loss": 0.6196, + "step": 6067 + }, + { + "epoch": 2.239970465173517, + "grad_norm": 0.9189581274986267, + "learning_rate": 3.495886347191838e-06, + "loss": 0.6193, + "step": 6068 + }, + { + "epoch": 2.2403396505045534, + "grad_norm": 0.9074673056602478, + "learning_rate": 3.4954402099730944e-06, + "loss": 0.6176, + "step": 6069 + }, + { + "epoch": 2.2407088358355893, + "grad_norm": 0.9072771072387695, + "learning_rate": 3.494994035077252e-06, + "loss": 0.6185, + "step": 6070 + }, + { + "epoch": 2.2410780211666257, + "grad_norm": 0.9044420719146729, + "learning_rate": 3.4945478225211995e-06, + "loss": 0.6272, + "step": 6071 + }, + { + "epoch": 2.2414472064976616, + "grad_norm": 0.8821624517440796, + "learning_rate": 3.494101572321826e-06, + "loss": 0.6244, + "step": 6072 + }, + { + "epoch": 2.241816391828698, + "grad_norm": 0.9147587418556213, + "learning_rate": 3.4936552844960213e-06, + "loss": 0.621, + "step": 6073 + }, + { + "epoch": 2.2421855771597343, + "grad_norm": 0.8667396306991577, + "learning_rate": 3.4932089590606783e-06, + "loss": 0.6196, + "step": 6074 + }, + { + "epoch": 2.2425547624907702, + "grad_norm": 0.8867070078849792, + "learning_rate": 3.4927625960326892e-06, + "loss": 0.6281, + "step": 6075 + }, + { + "epoch": 2.2429239478218066, + "grad_norm": 0.9203586578369141, + "learning_rate": 3.4923161954289493e-06, + "loss": 0.6107, + "step": 6076 + }, + { + "epoch": 2.243293133152843, + "grad_norm": 0.8638989329338074, + "learning_rate": 3.4918697572663553e-06, + "loss": 0.6018, + "step": 6077 + }, + { + "epoch": 2.243662318483879, + "grad_norm": 0.9101759791374207, + "learning_rate": 3.491423281561803e-06, + "loss": 0.6365, + "step": 6078 + }, + { + "epoch": 2.2440315038149152, + "grad_norm": 0.8990928530693054, + "learning_rate": 3.4909767683321925e-06, + "loss": 0.6214, + "step": 6079 + }, + { + "epoch": 2.244400689145951, + "grad_norm": 0.8945391774177551, + "learning_rate": 3.4905302175944237e-06, + "loss": 0.6623, + "step": 6080 + }, + { + "epoch": 2.2447698744769875, + "grad_norm": 0.9361270666122437, + "learning_rate": 3.4900836293654e-06, + "loss": 0.6373, + "step": 6081 + }, + { + "epoch": 2.2451390598080234, + "grad_norm": 0.8916853070259094, + "learning_rate": 3.4896370036620233e-06, + "loss": 0.5644, + "step": 6082 + }, + { + "epoch": 2.2455082451390598, + "grad_norm": 0.8757670521736145, + "learning_rate": 3.489190340501198e-06, + "loss": 0.6518, + "step": 6083 + }, + { + "epoch": 2.245877430470096, + "grad_norm": 0.8949745297431946, + "learning_rate": 3.48874363989983e-06, + "loss": 0.5977, + "step": 6084 + }, + { + "epoch": 2.246246615801132, + "grad_norm": 0.8614469766616821, + "learning_rate": 3.4882969018748277e-06, + "loss": 0.6213, + "step": 6085 + }, + { + "epoch": 2.2466158011321684, + "grad_norm": 0.8981713652610779, + "learning_rate": 3.4878501264430993e-06, + "loss": 0.6509, + "step": 6086 + }, + { + "epoch": 2.2469849864632043, + "grad_norm": 0.9468830227851868, + "learning_rate": 3.4874033136215556e-06, + "loss": 0.6633, + "step": 6087 + }, + { + "epoch": 2.2473541717942407, + "grad_norm": 0.8892241716384888, + "learning_rate": 3.486956463427108e-06, + "loss": 0.6581, + "step": 6088 + }, + { + "epoch": 2.247723357125277, + "grad_norm": 0.8693100810050964, + "learning_rate": 3.4865095758766703e-06, + "loss": 0.6309, + "step": 6089 + }, + { + "epoch": 2.248092542456313, + "grad_norm": 0.8690667152404785, + "learning_rate": 3.4860626509871553e-06, + "loss": 0.6148, + "step": 6090 + }, + { + "epoch": 2.2484617277873493, + "grad_norm": 0.9025923609733582, + "learning_rate": 3.485615688775481e-06, + "loss": 0.6416, + "step": 6091 + }, + { + "epoch": 2.2488309131183852, + "grad_norm": 0.8867904543876648, + "learning_rate": 3.485168689258564e-06, + "loss": 0.6185, + "step": 6092 + }, + { + "epoch": 2.2492000984494216, + "grad_norm": 0.9031831622123718, + "learning_rate": 3.484721652453323e-06, + "loss": 0.6179, + "step": 6093 + }, + { + "epoch": 2.249569283780458, + "grad_norm": 0.8808794021606445, + "learning_rate": 3.484274578376678e-06, + "loss": 0.6145, + "step": 6094 + }, + { + "epoch": 2.249938469111494, + "grad_norm": 0.8772534728050232, + "learning_rate": 3.483827467045551e-06, + "loss": 0.5975, + "step": 6095 + }, + { + "epoch": 2.2503076544425302, + "grad_norm": 0.9028250575065613, + "learning_rate": 3.483380318476865e-06, + "loss": 0.6124, + "step": 6096 + }, + { + "epoch": 2.250676839773566, + "grad_norm": 0.8762035369873047, + "learning_rate": 3.4829331326875443e-06, + "loss": 0.6571, + "step": 6097 + }, + { + "epoch": 2.2510460251046025, + "grad_norm": 0.8827697038650513, + "learning_rate": 3.4824859096945146e-06, + "loss": 0.6183, + "step": 6098 + }, + { + "epoch": 2.251415210435639, + "grad_norm": 0.842019259929657, + "learning_rate": 3.4820386495147027e-06, + "loss": 0.6031, + "step": 6099 + }, + { + "epoch": 2.2517843957666748, + "grad_norm": 0.8908863067626953, + "learning_rate": 3.481591352165039e-06, + "loss": 0.5949, + "step": 6100 + }, + { + "epoch": 2.252153581097711, + "grad_norm": 0.90083247423172, + "learning_rate": 3.481144017662452e-06, + "loss": 0.6318, + "step": 6101 + }, + { + "epoch": 2.252522766428747, + "grad_norm": 0.8866798877716064, + "learning_rate": 3.4806966460238735e-06, + "loss": 0.6055, + "step": 6102 + }, + { + "epoch": 2.2528919517597834, + "grad_norm": 0.9193224906921387, + "learning_rate": 3.4802492372662368e-06, + "loss": 0.6319, + "step": 6103 + }, + { + "epoch": 2.2532611370908198, + "grad_norm": 0.9311390519142151, + "learning_rate": 3.4798017914064745e-06, + "loss": 0.5751, + "step": 6104 + }, + { + "epoch": 2.2536303224218557, + "grad_norm": 0.8832557201385498, + "learning_rate": 3.479354308461525e-06, + "loss": 0.5823, + "step": 6105 + }, + { + "epoch": 2.253999507752892, + "grad_norm": 0.9057480692863464, + "learning_rate": 3.4789067884483234e-06, + "loss": 0.6001, + "step": 6106 + }, + { + "epoch": 2.254368693083928, + "grad_norm": 0.9054129719734192, + "learning_rate": 3.4784592313838095e-06, + "loss": 0.6344, + "step": 6107 + }, + { + "epoch": 2.2547378784149643, + "grad_norm": 0.9185937643051147, + "learning_rate": 3.4780116372849217e-06, + "loss": 0.6045, + "step": 6108 + }, + { + "epoch": 2.2551070637460007, + "grad_norm": 0.8616063594818115, + "learning_rate": 3.4775640061686024e-06, + "loss": 0.6359, + "step": 6109 + }, + { + "epoch": 2.2554762490770366, + "grad_norm": 0.8574202060699463, + "learning_rate": 3.4771163380517934e-06, + "loss": 0.614, + "step": 6110 + }, + { + "epoch": 2.255845434408073, + "grad_norm": 0.8877606391906738, + "learning_rate": 3.476668632951441e-06, + "loss": 0.6002, + "step": 6111 + }, + { + "epoch": 2.256214619739109, + "grad_norm": 0.926651656627655, + "learning_rate": 3.4762208908844875e-06, + "loss": 0.6075, + "step": 6112 + }, + { + "epoch": 2.256583805070145, + "grad_norm": 0.8699544668197632, + "learning_rate": 3.4757731118678816e-06, + "loss": 0.5915, + "step": 6113 + }, + { + "epoch": 2.2569529904011816, + "grad_norm": 0.8573347926139832, + "learning_rate": 3.4753252959185716e-06, + "loss": 0.623, + "step": 6114 + }, + { + "epoch": 2.2573221757322175, + "grad_norm": 0.8650701642036438, + "learning_rate": 3.4748774430535083e-06, + "loss": 0.6281, + "step": 6115 + }, + { + "epoch": 2.257691361063254, + "grad_norm": 0.8912199139595032, + "learning_rate": 3.47442955328964e-06, + "loss": 0.5902, + "step": 6116 + }, + { + "epoch": 2.2580605463942898, + "grad_norm": 0.8910509347915649, + "learning_rate": 3.473981626643921e-06, + "loss": 0.6214, + "step": 6117 + }, + { + "epoch": 2.258429731725326, + "grad_norm": 0.9053081274032593, + "learning_rate": 3.473533663133304e-06, + "loss": 0.6511, + "step": 6118 + }, + { + "epoch": 2.2587989170563625, + "grad_norm": 0.8889337778091431, + "learning_rate": 3.473085662774746e-06, + "loss": 0.6495, + "step": 6119 + }, + { + "epoch": 2.2591681023873984, + "grad_norm": 0.889898419380188, + "learning_rate": 3.4726376255852024e-06, + "loss": 0.625, + "step": 6120 + }, + { + "epoch": 2.2595372877184348, + "grad_norm": 0.8666785359382629, + "learning_rate": 3.4721895515816324e-06, + "loss": 0.5906, + "step": 6121 + }, + { + "epoch": 2.2599064730494707, + "grad_norm": 0.8980162739753723, + "learning_rate": 3.471741440780994e-06, + "loss": 0.5917, + "step": 6122 + }, + { + "epoch": 2.260275658380507, + "grad_norm": 0.8483160734176636, + "learning_rate": 3.4712932932002486e-06, + "loss": 0.6038, + "step": 6123 + }, + { + "epoch": 2.2606448437115434, + "grad_norm": 0.9220544099807739, + "learning_rate": 3.470845108856359e-06, + "loss": 0.6213, + "step": 6124 + }, + { + "epoch": 2.2610140290425793, + "grad_norm": 0.8637959957122803, + "learning_rate": 3.4703968877662888e-06, + "loss": 0.5999, + "step": 6125 + }, + { + "epoch": 2.2613832143736157, + "grad_norm": 0.9015430212020874, + "learning_rate": 3.4699486299470016e-06, + "loss": 0.6281, + "step": 6126 + }, + { + "epoch": 2.2617523997046516, + "grad_norm": 0.9048865437507629, + "learning_rate": 3.469500335415466e-06, + "loss": 0.6055, + "step": 6127 + }, + { + "epoch": 2.262121585035688, + "grad_norm": 0.8631407022476196, + "learning_rate": 3.4690520041886473e-06, + "loss": 0.6126, + "step": 6128 + }, + { + "epoch": 2.262490770366724, + "grad_norm": 0.9486720561981201, + "learning_rate": 3.4686036362835174e-06, + "loss": 0.6301, + "step": 6129 + }, + { + "epoch": 2.26285995569776, + "grad_norm": 0.8906315565109253, + "learning_rate": 3.4681552317170447e-06, + "loss": 0.5825, + "step": 6130 + }, + { + "epoch": 2.2632291410287966, + "grad_norm": 0.8989232778549194, + "learning_rate": 3.4677067905062027e-06, + "loss": 0.6294, + "step": 6131 + }, + { + "epoch": 2.2635983263598325, + "grad_norm": 0.9114752411842346, + "learning_rate": 3.467258312667963e-06, + "loss": 0.628, + "step": 6132 + }, + { + "epoch": 2.263967511690869, + "grad_norm": 0.8786404132843018, + "learning_rate": 3.466809798219303e-06, + "loss": 0.614, + "step": 6133 + }, + { + "epoch": 2.264336697021905, + "grad_norm": 0.901385486125946, + "learning_rate": 3.4663612471771964e-06, + "loss": 0.5788, + "step": 6134 + }, + { + "epoch": 2.264705882352941, + "grad_norm": 0.8819867372512817, + "learning_rate": 3.4659126595586216e-06, + "loss": 0.6868, + "step": 6135 + }, + { + "epoch": 2.2650750676839775, + "grad_norm": 0.8765539526939392, + "learning_rate": 3.465464035380558e-06, + "loss": 0.6129, + "step": 6136 + }, + { + "epoch": 2.2654442530150134, + "grad_norm": 0.8939322829246521, + "learning_rate": 3.4650153746599847e-06, + "loss": 0.6285, + "step": 6137 + }, + { + "epoch": 2.2658134383460498, + "grad_norm": 0.8900436162948608, + "learning_rate": 3.464566677413884e-06, + "loss": 0.6036, + "step": 6138 + }, + { + "epoch": 2.2661826236770857, + "grad_norm": 0.8905470371246338, + "learning_rate": 3.46411794365924e-06, + "loss": 0.5916, + "step": 6139 + }, + { + "epoch": 2.266551809008122, + "grad_norm": 0.9252784848213196, + "learning_rate": 3.4636691734130347e-06, + "loss": 0.6288, + "step": 6140 + }, + { + "epoch": 2.2669209943391584, + "grad_norm": 0.8990365266799927, + "learning_rate": 3.4632203666922564e-06, + "loss": 0.6157, + "step": 6141 + }, + { + "epoch": 2.2672901796701943, + "grad_norm": 0.8826862573623657, + "learning_rate": 3.4627715235138905e-06, + "loss": 0.6095, + "step": 6142 + }, + { + "epoch": 2.2676593650012307, + "grad_norm": 0.8898972272872925, + "learning_rate": 3.4623226438949263e-06, + "loss": 0.627, + "step": 6143 + }, + { + "epoch": 2.268028550332267, + "grad_norm": 0.8757809996604919, + "learning_rate": 3.461873727852354e-06, + "loss": 0.622, + "step": 6144 + }, + { + "epoch": 2.268397735663303, + "grad_norm": 0.8849049210548401, + "learning_rate": 3.461424775403165e-06, + "loss": 0.6303, + "step": 6145 + }, + { + "epoch": 2.2687669209943393, + "grad_norm": 0.8821909427642822, + "learning_rate": 3.460975786564351e-06, + "loss": 0.593, + "step": 6146 + }, + { + "epoch": 2.269136106325375, + "grad_norm": 0.8602946400642395, + "learning_rate": 3.4605267613529075e-06, + "loss": 0.6109, + "step": 6147 + }, + { + "epoch": 2.2695052916564116, + "grad_norm": 0.8861927390098572, + "learning_rate": 3.4600776997858287e-06, + "loss": 0.5705, + "step": 6148 + }, + { + "epoch": 2.2698744769874475, + "grad_norm": 0.8753241300582886, + "learning_rate": 3.4596286018801127e-06, + "loss": 0.6356, + "step": 6149 + }, + { + "epoch": 2.270243662318484, + "grad_norm": 0.8934397101402283, + "learning_rate": 3.459179467652756e-06, + "loss": 0.6652, + "step": 6150 + }, + { + "epoch": 2.27061284764952, + "grad_norm": 0.8745578527450562, + "learning_rate": 3.458730297120759e-06, + "loss": 0.6088, + "step": 6151 + }, + { + "epoch": 2.270982032980556, + "grad_norm": 0.8912252187728882, + "learning_rate": 3.458281090301124e-06, + "loss": 0.6448, + "step": 6152 + }, + { + "epoch": 2.2713512183115925, + "grad_norm": 0.9013225436210632, + "learning_rate": 3.4578318472108517e-06, + "loss": 0.6191, + "step": 6153 + }, + { + "epoch": 2.271720403642629, + "grad_norm": 0.8572041988372803, + "learning_rate": 3.457382567866946e-06, + "loss": 0.6487, + "step": 6154 + }, + { + "epoch": 2.2720895889736648, + "grad_norm": 0.8971853852272034, + "learning_rate": 3.456933252286413e-06, + "loss": 0.6253, + "step": 6155 + }, + { + "epoch": 2.272458774304701, + "grad_norm": 0.9146037697792053, + "learning_rate": 3.456483900486257e-06, + "loss": 0.634, + "step": 6156 + }, + { + "epoch": 2.272827959635737, + "grad_norm": 0.8906471133232117, + "learning_rate": 3.456034512483489e-06, + "loss": 0.6398, + "step": 6157 + }, + { + "epoch": 2.2731971449667734, + "grad_norm": 0.9582616090774536, + "learning_rate": 3.455585088295115e-06, + "loss": 0.6016, + "step": 6158 + }, + { + "epoch": 2.2735663302978093, + "grad_norm": 0.8742156028747559, + "learning_rate": 3.4551356279381478e-06, + "loss": 0.6243, + "step": 6159 + }, + { + "epoch": 2.2739355156288457, + "grad_norm": 0.8790980577468872, + "learning_rate": 3.454686131429598e-06, + "loss": 0.6682, + "step": 6160 + }, + { + "epoch": 2.274304700959882, + "grad_norm": 0.8720006346702576, + "learning_rate": 3.45423659878648e-06, + "loss": 0.6169, + "step": 6161 + }, + { + "epoch": 2.274673886290918, + "grad_norm": 0.8709516525268555, + "learning_rate": 3.453787030025807e-06, + "loss": 0.5987, + "step": 6162 + }, + { + "epoch": 2.2750430716219543, + "grad_norm": 0.9073929786682129, + "learning_rate": 3.453337425164597e-06, + "loss": 0.6493, + "step": 6163 + }, + { + "epoch": 2.27541225695299, + "grad_norm": 0.8919942378997803, + "learning_rate": 3.452887784219866e-06, + "loss": 0.6211, + "step": 6164 + }, + { + "epoch": 2.2757814422840266, + "grad_norm": 0.8934463262557983, + "learning_rate": 3.4524381072086327e-06, + "loss": 0.6351, + "step": 6165 + }, + { + "epoch": 2.276150627615063, + "grad_norm": 0.8640508055686951, + "learning_rate": 3.4519883941479176e-06, + "loss": 0.6089, + "step": 6166 + }, + { + "epoch": 2.276519812946099, + "grad_norm": 0.9026186466217041, + "learning_rate": 3.4515386450547427e-06, + "loss": 0.582, + "step": 6167 + }, + { + "epoch": 2.276888998277135, + "grad_norm": 0.9106242656707764, + "learning_rate": 3.45108885994613e-06, + "loss": 0.6382, + "step": 6168 + }, + { + "epoch": 2.277258183608171, + "grad_norm": 0.8976107239723206, + "learning_rate": 3.4506390388391047e-06, + "loss": 0.6422, + "step": 6169 + }, + { + "epoch": 2.2776273689392075, + "grad_norm": 0.8871420621871948, + "learning_rate": 3.450189181750691e-06, + "loss": 0.6021, + "step": 6170 + }, + { + "epoch": 2.277996554270244, + "grad_norm": 0.8628703951835632, + "learning_rate": 3.4497392886979174e-06, + "loss": 0.5995, + "step": 6171 + }, + { + "epoch": 2.2783657396012797, + "grad_norm": 0.8706042170524597, + "learning_rate": 3.449289359697811e-06, + "loss": 0.6377, + "step": 6172 + }, + { + "epoch": 2.278734924932316, + "grad_norm": 0.8817248344421387, + "learning_rate": 3.4488393947674024e-06, + "loss": 0.6332, + "step": 6173 + }, + { + "epoch": 2.279104110263352, + "grad_norm": 0.8981207609176636, + "learning_rate": 3.448389393923722e-06, + "loss": 0.5752, + "step": 6174 + }, + { + "epoch": 2.2794732955943884, + "grad_norm": 0.8878105282783508, + "learning_rate": 3.4479393571838023e-06, + "loss": 0.6207, + "step": 6175 + }, + { + "epoch": 2.2798424809254247, + "grad_norm": 0.8987954258918762, + "learning_rate": 3.4474892845646767e-06, + "loss": 0.6307, + "step": 6176 + }, + { + "epoch": 2.2802116662564607, + "grad_norm": 0.8861002922058105, + "learning_rate": 3.447039176083381e-06, + "loss": 0.6309, + "step": 6177 + }, + { + "epoch": 2.280580851587497, + "grad_norm": 0.8976836204528809, + "learning_rate": 3.4465890317569517e-06, + "loss": 0.6618, + "step": 6178 + }, + { + "epoch": 2.280950036918533, + "grad_norm": 0.9022141098976135, + "learning_rate": 3.4461388516024263e-06, + "loss": 0.6344, + "step": 6179 + }, + { + "epoch": 2.2813192222495693, + "grad_norm": 0.891180694103241, + "learning_rate": 3.445688635636843e-06, + "loss": 0.6064, + "step": 6180 + }, + { + "epoch": 2.2816884075806056, + "grad_norm": 0.8891002535820007, + "learning_rate": 3.445238383877244e-06, + "loss": 0.6481, + "step": 6181 + }, + { + "epoch": 2.2820575929116416, + "grad_norm": 0.8967509269714355, + "learning_rate": 3.4447880963406704e-06, + "loss": 0.648, + "step": 6182 + }, + { + "epoch": 2.282426778242678, + "grad_norm": 0.909264326095581, + "learning_rate": 3.4443377730441663e-06, + "loss": 0.6108, + "step": 6183 + }, + { + "epoch": 2.282795963573714, + "grad_norm": 0.8770445585250854, + "learning_rate": 3.443887414004774e-06, + "loss": 0.6092, + "step": 6184 + }, + { + "epoch": 2.28316514890475, + "grad_norm": 0.8717084527015686, + "learning_rate": 3.4434370192395423e-06, + "loss": 0.6257, + "step": 6185 + }, + { + "epoch": 2.2835343342357866, + "grad_norm": 0.9157881736755371, + "learning_rate": 3.4429865887655166e-06, + "loss": 0.6031, + "step": 6186 + }, + { + "epoch": 2.2839035195668225, + "grad_norm": 0.8621339797973633, + "learning_rate": 3.442536122599747e-06, + "loss": 0.6169, + "step": 6187 + }, + { + "epoch": 2.284272704897859, + "grad_norm": 0.8910555839538574, + "learning_rate": 3.442085620759281e-06, + "loss": 0.6309, + "step": 6188 + }, + { + "epoch": 2.2846418902288947, + "grad_norm": 0.8977258205413818, + "learning_rate": 3.441635083261172e-06, + "loss": 0.6369, + "step": 6189 + }, + { + "epoch": 2.285011075559931, + "grad_norm": 0.8595260977745056, + "learning_rate": 3.4411845101224726e-06, + "loss": 0.6022, + "step": 6190 + }, + { + "epoch": 2.2853802608909675, + "grad_norm": 0.9148237109184265, + "learning_rate": 3.4407339013602376e-06, + "loss": 0.6331, + "step": 6191 + }, + { + "epoch": 2.2857494462220034, + "grad_norm": 0.8993155360221863, + "learning_rate": 3.44028325699152e-06, + "loss": 0.6136, + "step": 6192 + }, + { + "epoch": 2.2861186315530397, + "grad_norm": 0.8950616717338562, + "learning_rate": 3.4398325770333786e-06, + "loss": 0.6693, + "step": 6193 + }, + { + "epoch": 2.2864878168840757, + "grad_norm": 0.9131431579589844, + "learning_rate": 3.4393818615028697e-06, + "loss": 0.6108, + "step": 6194 + }, + { + "epoch": 2.286857002215112, + "grad_norm": 0.9119157195091248, + "learning_rate": 3.438931110417055e-06, + "loss": 0.6105, + "step": 6195 + }, + { + "epoch": 2.287226187546148, + "grad_norm": 0.859169065952301, + "learning_rate": 3.4384803237929936e-06, + "loss": 0.5929, + "step": 6196 + }, + { + "epoch": 2.2875953728771843, + "grad_norm": 0.8879808187484741, + "learning_rate": 3.4380295016477482e-06, + "loss": 0.635, + "step": 6197 + }, + { + "epoch": 2.2879645582082206, + "grad_norm": 0.8932011127471924, + "learning_rate": 3.437578643998382e-06, + "loss": 0.6367, + "step": 6198 + }, + { + "epoch": 2.2883337435392566, + "grad_norm": 0.8757479190826416, + "learning_rate": 3.4371277508619604e-06, + "loss": 0.613, + "step": 6199 + }, + { + "epoch": 2.288702928870293, + "grad_norm": 0.9067786931991577, + "learning_rate": 3.436676822255549e-06, + "loss": 0.611, + "step": 6200 + }, + { + "epoch": 2.2890721142013293, + "grad_norm": 0.892240047454834, + "learning_rate": 3.436225858196216e-06, + "loss": 0.6296, + "step": 6201 + }, + { + "epoch": 2.289441299532365, + "grad_norm": 0.922770619392395, + "learning_rate": 3.435774858701029e-06, + "loss": 0.6151, + "step": 6202 + }, + { + "epoch": 2.2898104848634016, + "grad_norm": 0.8978666067123413, + "learning_rate": 3.435323823787059e-06, + "loss": 0.6324, + "step": 6203 + }, + { + "epoch": 2.2901796701944375, + "grad_norm": 0.8615928292274475, + "learning_rate": 3.4348727534713777e-06, + "loss": 0.5828, + "step": 6204 + }, + { + "epoch": 2.290548855525474, + "grad_norm": 0.9147198796272278, + "learning_rate": 3.434421647771058e-06, + "loss": 0.6527, + "step": 6205 + }, + { + "epoch": 2.2909180408565097, + "grad_norm": 0.891470730304718, + "learning_rate": 3.4339705067031733e-06, + "loss": 0.6459, + "step": 6206 + }, + { + "epoch": 2.291287226187546, + "grad_norm": 0.9031770825386047, + "learning_rate": 3.4335193302848003e-06, + "loss": 0.6465, + "step": 6207 + }, + { + "epoch": 2.2916564115185825, + "grad_norm": 0.84893798828125, + "learning_rate": 3.433068118533014e-06, + "loss": 0.6014, + "step": 6208 + }, + { + "epoch": 2.2920255968496184, + "grad_norm": 0.8868657350540161, + "learning_rate": 3.432616871464895e-06, + "loss": 0.6505, + "step": 6209 + }, + { + "epoch": 2.2923947821806547, + "grad_norm": 0.8960661292076111, + "learning_rate": 3.4321655890975214e-06, + "loss": 0.6263, + "step": 6210 + }, + { + "epoch": 2.292763967511691, + "grad_norm": 0.8795142769813538, + "learning_rate": 3.4317142714479753e-06, + "loss": 0.6148, + "step": 6211 + }, + { + "epoch": 2.293133152842727, + "grad_norm": 1.2894729375839233, + "learning_rate": 3.431262918533337e-06, + "loss": 0.607, + "step": 6212 + }, + { + "epoch": 2.2935023381737634, + "grad_norm": 0.8766537308692932, + "learning_rate": 3.430811530370691e-06, + "loss": 0.6235, + "step": 6213 + }, + { + "epoch": 2.2938715235047993, + "grad_norm": 0.8993602395057678, + "learning_rate": 3.430360106977123e-06, + "loss": 0.6342, + "step": 6214 + }, + { + "epoch": 2.2942407088358356, + "grad_norm": 0.9271663427352905, + "learning_rate": 3.4299086483697185e-06, + "loss": 0.6283, + "step": 6215 + }, + { + "epoch": 2.2946098941668716, + "grad_norm": 0.8612810373306274, + "learning_rate": 3.4294571545655653e-06, + "loss": 0.5885, + "step": 6216 + }, + { + "epoch": 2.294979079497908, + "grad_norm": 0.8774402141571045, + "learning_rate": 3.429005625581752e-06, + "loss": 0.6359, + "step": 6217 + }, + { + "epoch": 2.2953482648289443, + "grad_norm": 0.8979935646057129, + "learning_rate": 3.4285540614353684e-06, + "loss": 0.6157, + "step": 6218 + }, + { + "epoch": 2.29571745015998, + "grad_norm": 0.8893808126449585, + "learning_rate": 3.428102462143508e-06, + "loss": 0.6375, + "step": 6219 + }, + { + "epoch": 2.2960866354910165, + "grad_norm": 0.9148460626602173, + "learning_rate": 3.4276508277232605e-06, + "loss": 0.5941, + "step": 6220 + }, + { + "epoch": 2.296455820822053, + "grad_norm": 0.8931499123573303, + "learning_rate": 3.4271991581917235e-06, + "loss": 0.6037, + "step": 6221 + }, + { + "epoch": 2.296825006153089, + "grad_norm": 0.9357686042785645, + "learning_rate": 3.4267474535659893e-06, + "loss": 0.6145, + "step": 6222 + }, + { + "epoch": 2.297194191484125, + "grad_norm": 0.870486855506897, + "learning_rate": 3.426295713863158e-06, + "loss": 0.6278, + "step": 6223 + }, + { + "epoch": 2.297563376815161, + "grad_norm": 0.8853832483291626, + "learning_rate": 3.425843939100326e-06, + "loss": 0.644, + "step": 6224 + }, + { + "epoch": 2.2979325621461975, + "grad_norm": 0.8614327311515808, + "learning_rate": 3.4253921292945934e-06, + "loss": 0.6364, + "step": 6225 + }, + { + "epoch": 2.2983017474772334, + "grad_norm": 0.9016979932785034, + "learning_rate": 3.42494028446306e-06, + "loss": 0.6052, + "step": 6226 + }, + { + "epoch": 2.2986709328082697, + "grad_norm": 0.8832420110702515, + "learning_rate": 3.4244884046228287e-06, + "loss": 0.6332, + "step": 6227 + }, + { + "epoch": 2.299040118139306, + "grad_norm": 0.8864814043045044, + "learning_rate": 3.4240364897910033e-06, + "loss": 0.6191, + "step": 6228 + }, + { + "epoch": 2.299409303470342, + "grad_norm": 0.9074990153312683, + "learning_rate": 3.423584539984689e-06, + "loss": 0.617, + "step": 6229 + }, + { + "epoch": 2.2997784888013784, + "grad_norm": 0.8765585422515869, + "learning_rate": 3.4231325552209904e-06, + "loss": 0.6453, + "step": 6230 + }, + { + "epoch": 2.3001476741324147, + "grad_norm": 0.8922472596168518, + "learning_rate": 3.4226805355170167e-06, + "loss": 0.6085, + "step": 6231 + }, + { + "epoch": 2.3005168594634506, + "grad_norm": 0.8759594559669495, + "learning_rate": 3.4222284808898755e-06, + "loss": 0.6322, + "step": 6232 + }, + { + "epoch": 2.300886044794487, + "grad_norm": 0.8903263807296753, + "learning_rate": 3.421776391356677e-06, + "loss": 0.6185, + "step": 6233 + }, + { + "epoch": 2.301255230125523, + "grad_norm": 0.8800038695335388, + "learning_rate": 3.421324266934534e-06, + "loss": 0.6009, + "step": 6234 + }, + { + "epoch": 2.3016244154565593, + "grad_norm": 0.9056914448738098, + "learning_rate": 3.4208721076405575e-06, + "loss": 0.6395, + "step": 6235 + }, + { + "epoch": 2.301993600787595, + "grad_norm": 0.8917935490608215, + "learning_rate": 3.4204199134918627e-06, + "loss": 0.6254, + "step": 6236 + }, + { + "epoch": 2.3023627861186315, + "grad_norm": 0.9002336859703064, + "learning_rate": 3.419967684505564e-06, + "loss": 0.6281, + "step": 6237 + }, + { + "epoch": 2.302731971449668, + "grad_norm": 0.8894745111465454, + "learning_rate": 3.41951542069878e-06, + "loss": 0.6144, + "step": 6238 + }, + { + "epoch": 2.303101156780704, + "grad_norm": 0.9130890965461731, + "learning_rate": 3.4190631220886266e-06, + "loss": 0.6135, + "step": 6239 + }, + { + "epoch": 2.30347034211174, + "grad_norm": 0.8892707824707031, + "learning_rate": 3.4186107886922245e-06, + "loss": 0.5865, + "step": 6240 + }, + { + "epoch": 2.303839527442776, + "grad_norm": 0.8958227634429932, + "learning_rate": 3.4181584205266937e-06, + "loss": 0.6327, + "step": 6241 + }, + { + "epoch": 2.3042087127738125, + "grad_norm": 0.8840674161911011, + "learning_rate": 3.4177060176091565e-06, + "loss": 0.6349, + "step": 6242 + }, + { + "epoch": 2.304577898104849, + "grad_norm": 0.8614770174026489, + "learning_rate": 3.417253579956736e-06, + "loss": 0.6405, + "step": 6243 + }, + { + "epoch": 2.3049470834358847, + "grad_norm": 0.9040419459342957, + "learning_rate": 3.4168011075865575e-06, + "loss": 0.625, + "step": 6244 + }, + { + "epoch": 2.305316268766921, + "grad_norm": 0.8583013415336609, + "learning_rate": 3.4163486005157464e-06, + "loss": 0.5946, + "step": 6245 + }, + { + "epoch": 2.305685454097957, + "grad_norm": 0.8643393516540527, + "learning_rate": 3.4158960587614298e-06, + "loss": 0.5443, + "step": 6246 + }, + { + "epoch": 2.3060546394289934, + "grad_norm": 0.8592897653579712, + "learning_rate": 3.4154434823407363e-06, + "loss": 0.6056, + "step": 6247 + }, + { + "epoch": 2.3064238247600297, + "grad_norm": 0.881937563419342, + "learning_rate": 3.414990871270796e-06, + "loss": 0.6134, + "step": 6248 + }, + { + "epoch": 2.3067930100910656, + "grad_norm": 0.8865100741386414, + "learning_rate": 3.41453822556874e-06, + "loss": 0.6337, + "step": 6249 + }, + { + "epoch": 2.307162195422102, + "grad_norm": 0.9205448627471924, + "learning_rate": 3.414085545251701e-06, + "loss": 0.6397, + "step": 6250 + }, + { + "epoch": 2.307531380753138, + "grad_norm": 0.8908988237380981, + "learning_rate": 3.4136328303368117e-06, + "loss": 0.6103, + "step": 6251 + }, + { + "epoch": 2.3079005660841743, + "grad_norm": 0.8937252759933472, + "learning_rate": 3.4131800808412087e-06, + "loss": 0.6011, + "step": 6252 + }, + { + "epoch": 2.3082697514152106, + "grad_norm": 0.8901246190071106, + "learning_rate": 3.412727296782028e-06, + "loss": 0.5827, + "step": 6253 + }, + { + "epoch": 2.3086389367462465, + "grad_norm": 0.9124628305435181, + "learning_rate": 3.412274478176407e-06, + "loss": 0.6319, + "step": 6254 + }, + { + "epoch": 2.309008122077283, + "grad_norm": 0.9095810651779175, + "learning_rate": 3.4118216250414852e-06, + "loss": 0.6541, + "step": 6255 + }, + { + "epoch": 2.309377307408319, + "grad_norm": 0.889266312122345, + "learning_rate": 3.4113687373944027e-06, + "loss": 0.603, + "step": 6256 + }, + { + "epoch": 2.309746492739355, + "grad_norm": 0.9117671251296997, + "learning_rate": 3.4109158152523007e-06, + "loss": 0.648, + "step": 6257 + }, + { + "epoch": 2.3101156780703915, + "grad_norm": 0.902371883392334, + "learning_rate": 3.410462858632323e-06, + "loss": 0.6353, + "step": 6258 + }, + { + "epoch": 2.3104848634014274, + "grad_norm": 0.9101449251174927, + "learning_rate": 3.4100098675516136e-06, + "loss": 0.6323, + "step": 6259 + }, + { + "epoch": 2.310854048732464, + "grad_norm": 0.8673690557479858, + "learning_rate": 3.409556842027317e-06, + "loss": 0.6159, + "step": 6260 + }, + { + "epoch": 2.3112232340634997, + "grad_norm": 0.8818574547767639, + "learning_rate": 3.4091037820765816e-06, + "loss": 0.6298, + "step": 6261 + }, + { + "epoch": 2.311592419394536, + "grad_norm": 0.9054052829742432, + "learning_rate": 3.4086506877165547e-06, + "loss": 0.6097, + "step": 6262 + }, + { + "epoch": 2.311961604725572, + "grad_norm": 0.8657010197639465, + "learning_rate": 3.408197558964387e-06, + "loss": 0.6219, + "step": 6263 + }, + { + "epoch": 2.3123307900566084, + "grad_norm": 0.8967298865318298, + "learning_rate": 3.407744395837227e-06, + "loss": 0.6394, + "step": 6264 + }, + { + "epoch": 2.3126999753876447, + "grad_norm": 0.8443505167961121, + "learning_rate": 3.4072911983522285e-06, + "loss": 0.6457, + "step": 6265 + }, + { + "epoch": 2.3130691607186806, + "grad_norm": 0.9098929166793823, + "learning_rate": 3.406837966526545e-06, + "loss": 0.6143, + "step": 6266 + }, + { + "epoch": 2.313438346049717, + "grad_norm": 0.8870005011558533, + "learning_rate": 3.4063847003773304e-06, + "loss": 0.6323, + "step": 6267 + }, + { + "epoch": 2.3138075313807533, + "grad_norm": 0.9079236388206482, + "learning_rate": 3.4059313999217408e-06, + "loss": 0.6024, + "step": 6268 + }, + { + "epoch": 2.3141767167117893, + "grad_norm": 0.8962991833686829, + "learning_rate": 3.4054780651769335e-06, + "loss": 0.6487, + "step": 6269 + }, + { + "epoch": 2.3145459020428256, + "grad_norm": 0.8861629962921143, + "learning_rate": 3.4050246961600674e-06, + "loss": 0.6279, + "step": 6270 + }, + { + "epoch": 2.3149150873738615, + "grad_norm": 0.8593908548355103, + "learning_rate": 3.4045712928883025e-06, + "loss": 0.5604, + "step": 6271 + }, + { + "epoch": 2.315284272704898, + "grad_norm": 0.9024479389190674, + "learning_rate": 3.404117855378799e-06, + "loss": 0.6538, + "step": 6272 + }, + { + "epoch": 2.315653458035934, + "grad_norm": 0.8918688893318176, + "learning_rate": 3.403664383648721e-06, + "loss": 0.6341, + "step": 6273 + }, + { + "epoch": 2.31602264336697, + "grad_norm": 0.8664906620979309, + "learning_rate": 3.4032108777152306e-06, + "loss": 0.6273, + "step": 6274 + }, + { + "epoch": 2.3163918286980065, + "grad_norm": 0.8889983296394348, + "learning_rate": 3.402757337595493e-06, + "loss": 0.5981, + "step": 6275 + }, + { + "epoch": 2.3167610140290424, + "grad_norm": 0.875927746295929, + "learning_rate": 3.4023037633066757e-06, + "loss": 0.656, + "step": 6276 + }, + { + "epoch": 2.317130199360079, + "grad_norm": 0.8894012570381165, + "learning_rate": 3.401850154865946e-06, + "loss": 0.6366, + "step": 6277 + }, + { + "epoch": 2.317499384691115, + "grad_norm": 0.8879987001419067, + "learning_rate": 3.4013965122904724e-06, + "loss": 0.5884, + "step": 6278 + }, + { + "epoch": 2.317868570022151, + "grad_norm": 0.9250081181526184, + "learning_rate": 3.400942835597425e-06, + "loss": 0.6049, + "step": 6279 + }, + { + "epoch": 2.3182377553531874, + "grad_norm": 0.8858482837677002, + "learning_rate": 3.4004891248039764e-06, + "loss": 0.6068, + "step": 6280 + }, + { + "epoch": 2.3186069406842233, + "grad_norm": 0.8867558240890503, + "learning_rate": 3.400035379927298e-06, + "loss": 0.6353, + "step": 6281 + }, + { + "epoch": 2.3189761260152597, + "grad_norm": 0.9119377136230469, + "learning_rate": 3.3995816009845638e-06, + "loss": 0.6768, + "step": 6282 + }, + { + "epoch": 2.3193453113462956, + "grad_norm": 0.8950801491737366, + "learning_rate": 3.399127787992951e-06, + "loss": 0.6397, + "step": 6283 + }, + { + "epoch": 2.319714496677332, + "grad_norm": 0.9186090230941772, + "learning_rate": 3.398673940969635e-06, + "loss": 0.5941, + "step": 6284 + }, + { + "epoch": 2.3200836820083683, + "grad_norm": 0.8988368511199951, + "learning_rate": 3.3982200599317938e-06, + "loss": 0.6015, + "step": 6285 + }, + { + "epoch": 2.3204528673394043, + "grad_norm": 0.8934771418571472, + "learning_rate": 3.3977661448966063e-06, + "loss": 0.6213, + "step": 6286 + }, + { + "epoch": 2.3208220526704406, + "grad_norm": 0.8799237012863159, + "learning_rate": 3.397312195881255e-06, + "loss": 0.6181, + "step": 6287 + }, + { + "epoch": 2.321191238001477, + "grad_norm": 0.8972417712211609, + "learning_rate": 3.3968582129029186e-06, + "loss": 0.5763, + "step": 6288 + }, + { + "epoch": 2.321560423332513, + "grad_norm": 0.8800331950187683, + "learning_rate": 3.396404195978783e-06, + "loss": 0.6033, + "step": 6289 + }, + { + "epoch": 2.3219296086635492, + "grad_norm": 0.8610182404518127, + "learning_rate": 3.3959501451260314e-06, + "loss": 0.5847, + "step": 6290 + }, + { + "epoch": 2.322298793994585, + "grad_norm": 0.9137473702430725, + "learning_rate": 3.395496060361849e-06, + "loss": 0.6327, + "step": 6291 + }, + { + "epoch": 2.3226679793256215, + "grad_norm": 0.8819847702980042, + "learning_rate": 3.3950419417034237e-06, + "loss": 0.6314, + "step": 6292 + }, + { + "epoch": 2.3230371646566574, + "grad_norm": 0.9166555404663086, + "learning_rate": 3.3945877891679435e-06, + "loss": 0.6308, + "step": 6293 + }, + { + "epoch": 2.323406349987694, + "grad_norm": 0.9089335203170776, + "learning_rate": 3.3941336027725973e-06, + "loss": 0.603, + "step": 6294 + }, + { + "epoch": 2.32377553531873, + "grad_norm": 0.8713858127593994, + "learning_rate": 3.393679382534577e-06, + "loss": 0.6219, + "step": 6295 + }, + { + "epoch": 2.324144720649766, + "grad_norm": 0.8974630236625671, + "learning_rate": 3.3932251284710736e-06, + "loss": 0.6152, + "step": 6296 + }, + { + "epoch": 2.3245139059808024, + "grad_norm": 0.9093474745750427, + "learning_rate": 3.3927708405992814e-06, + "loss": 0.6223, + "step": 6297 + }, + { + "epoch": 2.324883091311839, + "grad_norm": 0.8903823494911194, + "learning_rate": 3.392316518936395e-06, + "loss": 0.6255, + "step": 6298 + }, + { + "epoch": 2.3252522766428747, + "grad_norm": 0.8683795928955078, + "learning_rate": 3.391862163499609e-06, + "loss": 0.6069, + "step": 6299 + }, + { + "epoch": 2.325621461973911, + "grad_norm": 0.8939173817634583, + "learning_rate": 3.3914077743061222e-06, + "loss": 0.6596, + "step": 6300 + }, + { + "epoch": 2.325990647304947, + "grad_norm": 0.9083729982376099, + "learning_rate": 3.3909533513731325e-06, + "loss": 0.5985, + "step": 6301 + }, + { + "epoch": 2.3263598326359833, + "grad_norm": 0.8815414309501648, + "learning_rate": 3.3904988947178397e-06, + "loss": 0.5982, + "step": 6302 + }, + { + "epoch": 2.3267290179670193, + "grad_norm": 0.8952083587646484, + "learning_rate": 3.3900444043574444e-06, + "loss": 0.5934, + "step": 6303 + }, + { + "epoch": 2.3270982032980556, + "grad_norm": 0.8855516314506531, + "learning_rate": 3.38958988030915e-06, + "loss": 0.6195, + "step": 6304 + }, + { + "epoch": 2.327467388629092, + "grad_norm": 0.8798015713691711, + "learning_rate": 3.3891353225901587e-06, + "loss": 0.6001, + "step": 6305 + }, + { + "epoch": 2.327836573960128, + "grad_norm": 0.870548665523529, + "learning_rate": 3.388680731217676e-06, + "loss": 0.6476, + "step": 6306 + }, + { + "epoch": 2.3282057592911642, + "grad_norm": 0.8709492683410645, + "learning_rate": 3.3882261062089085e-06, + "loss": 0.5728, + "step": 6307 + }, + { + "epoch": 2.3285749446222, + "grad_norm": 0.8850780129432678, + "learning_rate": 3.387771447581063e-06, + "loss": 0.594, + "step": 6308 + }, + { + "epoch": 2.3289441299532365, + "grad_norm": 0.8986620306968689, + "learning_rate": 3.387316755351348e-06, + "loss": 0.6223, + "step": 6309 + }, + { + "epoch": 2.329313315284273, + "grad_norm": 0.8928316235542297, + "learning_rate": 3.3868620295369748e-06, + "loss": 0.6438, + "step": 6310 + }, + { + "epoch": 2.329682500615309, + "grad_norm": 0.8966062664985657, + "learning_rate": 3.386407270155153e-06, + "loss": 0.5738, + "step": 6311 + }, + { + "epoch": 2.330051685946345, + "grad_norm": 0.863533616065979, + "learning_rate": 3.3859524772230963e-06, + "loss": 0.6056, + "step": 6312 + }, + { + "epoch": 2.330420871277381, + "grad_norm": 0.8805823922157288, + "learning_rate": 3.385497650758017e-06, + "loss": 0.6652, + "step": 6313 + }, + { + "epoch": 2.3307900566084174, + "grad_norm": 0.9126018285751343, + "learning_rate": 3.385042790777132e-06, + "loss": 0.6294, + "step": 6314 + }, + { + "epoch": 2.331159241939454, + "grad_norm": 0.8768594264984131, + "learning_rate": 3.3845878972976565e-06, + "loss": 0.6253, + "step": 6315 + }, + { + "epoch": 2.3315284272704897, + "grad_norm": 0.899366557598114, + "learning_rate": 3.384132970336808e-06, + "loss": 0.617, + "step": 6316 + }, + { + "epoch": 2.331897612601526, + "grad_norm": 0.8887999057769775, + "learning_rate": 3.3836780099118056e-06, + "loss": 0.6218, + "step": 6317 + }, + { + "epoch": 2.332266797932562, + "grad_norm": 0.8769640922546387, + "learning_rate": 3.38322301603987e-06, + "loss": 0.6101, + "step": 6318 + }, + { + "epoch": 2.3326359832635983, + "grad_norm": 0.9050132036209106, + "learning_rate": 3.3827679887382214e-06, + "loss": 0.6523, + "step": 6319 + }, + { + "epoch": 2.3330051685946347, + "grad_norm": 0.9050790667533875, + "learning_rate": 3.3823129280240834e-06, + "loss": 0.6419, + "step": 6320 + }, + { + "epoch": 2.3333743539256706, + "grad_norm": 0.9192018508911133, + "learning_rate": 3.381857833914679e-06, + "loss": 0.6383, + "step": 6321 + }, + { + "epoch": 2.333743539256707, + "grad_norm": 0.9130533337593079, + "learning_rate": 3.3814027064272343e-06, + "loss": 0.6124, + "step": 6322 + }, + { + "epoch": 2.334112724587743, + "grad_norm": 0.9056592583656311, + "learning_rate": 3.3809475455789755e-06, + "loss": 0.6, + "step": 6323 + }, + { + "epoch": 2.3344819099187792, + "grad_norm": 0.8882870078086853, + "learning_rate": 3.38049235138713e-06, + "loss": 0.6281, + "step": 6324 + }, + { + "epoch": 2.3348510952498156, + "grad_norm": 0.8765662312507629, + "learning_rate": 3.3800371238689265e-06, + "loss": 0.5687, + "step": 6325 + }, + { + "epoch": 2.3352202805808515, + "grad_norm": 0.874826192855835, + "learning_rate": 3.3795818630415963e-06, + "loss": 0.5954, + "step": 6326 + }, + { + "epoch": 2.335589465911888, + "grad_norm": 0.9224770069122314, + "learning_rate": 3.3791265689223695e-06, + "loss": 0.6295, + "step": 6327 + }, + { + "epoch": 2.335958651242924, + "grad_norm": 0.8495580554008484, + "learning_rate": 3.37867124152848e-06, + "loss": 0.575, + "step": 6328 + }, + { + "epoch": 2.33632783657396, + "grad_norm": 0.8767423629760742, + "learning_rate": 3.3782158808771608e-06, + "loss": 0.6091, + "step": 6329 + }, + { + "epoch": 2.336697021904996, + "grad_norm": 0.8993131518363953, + "learning_rate": 3.3777604869856483e-06, + "loss": 0.5933, + "step": 6330 + }, + { + "epoch": 2.3370662072360324, + "grad_norm": 0.8980692028999329, + "learning_rate": 3.377305059871178e-06, + "loss": 0.6144, + "step": 6331 + }, + { + "epoch": 2.337435392567069, + "grad_norm": 0.8966735601425171, + "learning_rate": 3.376849599550988e-06, + "loss": 0.6168, + "step": 6332 + }, + { + "epoch": 2.3378045778981047, + "grad_norm": 0.8800783157348633, + "learning_rate": 3.3763941060423178e-06, + "loss": 0.5937, + "step": 6333 + }, + { + "epoch": 2.338173763229141, + "grad_norm": 0.8669251203536987, + "learning_rate": 3.3759385793624073e-06, + "loss": 0.5986, + "step": 6334 + }, + { + "epoch": 2.3385429485601774, + "grad_norm": 0.8944815397262573, + "learning_rate": 3.3754830195284978e-06, + "loss": 0.6265, + "step": 6335 + }, + { + "epoch": 2.3389121338912133, + "grad_norm": 0.8990873098373413, + "learning_rate": 3.375027426557832e-06, + "loss": 0.6541, + "step": 6336 + }, + { + "epoch": 2.3392813192222497, + "grad_norm": 0.9112443327903748, + "learning_rate": 3.374571800467654e-06, + "loss": 0.6391, + "step": 6337 + }, + { + "epoch": 2.3396505045532856, + "grad_norm": 0.863132119178772, + "learning_rate": 3.3741161412752103e-06, + "loss": 0.6209, + "step": 6338 + }, + { + "epoch": 2.340019689884322, + "grad_norm": 0.9120559096336365, + "learning_rate": 3.3736604489977465e-06, + "loss": 0.6423, + "step": 6339 + }, + { + "epoch": 2.340388875215358, + "grad_norm": 0.8751650452613831, + "learning_rate": 3.37320472365251e-06, + "loss": 0.5847, + "step": 6340 + }, + { + "epoch": 2.3407580605463942, + "grad_norm": 0.9585590958595276, + "learning_rate": 3.37274896525675e-06, + "loss": 0.6198, + "step": 6341 + }, + { + "epoch": 2.3411272458774306, + "grad_norm": 0.8802034854888916, + "learning_rate": 3.3722931738277176e-06, + "loss": 0.6172, + "step": 6342 + }, + { + "epoch": 2.3414964312084665, + "grad_norm": 0.8840614557266235, + "learning_rate": 3.371837349382664e-06, + "loss": 0.6106, + "step": 6343 + }, + { + "epoch": 2.341865616539503, + "grad_norm": 0.8793770670890808, + "learning_rate": 3.3713814919388423e-06, + "loss": 0.5924, + "step": 6344 + }, + { + "epoch": 2.3422348018705392, + "grad_norm": 0.8946751356124878, + "learning_rate": 3.370925601513505e-06, + "loss": 0.5867, + "step": 6345 + }, + { + "epoch": 2.342603987201575, + "grad_norm": 0.9340003132820129, + "learning_rate": 3.3704696781239087e-06, + "loss": 0.6286, + "step": 6346 + }, + { + "epoch": 2.3429731725326115, + "grad_norm": 0.8894443511962891, + "learning_rate": 3.3700137217873104e-06, + "loss": 0.6303, + "step": 6347 + }, + { + "epoch": 2.3433423578636474, + "grad_norm": 0.8947777152061462, + "learning_rate": 3.369557732520967e-06, + "loss": 0.6185, + "step": 6348 + }, + { + "epoch": 2.3437115431946838, + "grad_norm": 0.8844276070594788, + "learning_rate": 3.369101710342138e-06, + "loss": 0.6025, + "step": 6349 + }, + { + "epoch": 2.3440807285257197, + "grad_norm": 0.858999490737915, + "learning_rate": 3.3686456552680833e-06, + "loss": 0.6686, + "step": 6350 + }, + { + "epoch": 2.344449913856756, + "grad_norm": 0.9081360697746277, + "learning_rate": 3.3681895673160646e-06, + "loss": 0.6182, + "step": 6351 + }, + { + "epoch": 2.3448190991877924, + "grad_norm": 0.857559323310852, + "learning_rate": 3.3677334465033455e-06, + "loss": 0.6567, + "step": 6352 + }, + { + "epoch": 2.3451882845188283, + "grad_norm": 0.8883553147315979, + "learning_rate": 3.3672772928471885e-06, + "loss": 0.5894, + "step": 6353 + }, + { + "epoch": 2.3455574698498647, + "grad_norm": 0.8662473559379578, + "learning_rate": 3.3668211063648605e-06, + "loss": 0.5919, + "step": 6354 + }, + { + "epoch": 2.345926655180901, + "grad_norm": 0.9215657711029053, + "learning_rate": 3.366364887073626e-06, + "loss": 0.6106, + "step": 6355 + }, + { + "epoch": 2.346295840511937, + "grad_norm": 0.8903319239616394, + "learning_rate": 3.3659086349907554e-06, + "loss": 0.5985, + "step": 6356 + }, + { + "epoch": 2.3466650258429733, + "grad_norm": 0.9090496301651001, + "learning_rate": 3.3654523501335152e-06, + "loss": 0.6531, + "step": 6357 + }, + { + "epoch": 2.3470342111740092, + "grad_norm": 0.9018543362617493, + "learning_rate": 3.364996032519178e-06, + "loss": 0.6318, + "step": 6358 + }, + { + "epoch": 2.3474033965050456, + "grad_norm": 0.9077848196029663, + "learning_rate": 3.364539682165012e-06, + "loss": 0.6142, + "step": 6359 + }, + { + "epoch": 2.3477725818360815, + "grad_norm": 0.8895348310470581, + "learning_rate": 3.3640832990882932e-06, + "loss": 0.6767, + "step": 6360 + }, + { + "epoch": 2.348141767167118, + "grad_norm": 0.8806608319282532, + "learning_rate": 3.3636268833062935e-06, + "loss": 0.5871, + "step": 6361 + }, + { + "epoch": 2.3485109524981542, + "grad_norm": 0.8934316039085388, + "learning_rate": 3.36317043483629e-06, + "loss": 0.6199, + "step": 6362 + }, + { + "epoch": 2.34888013782919, + "grad_norm": 0.9102845788002014, + "learning_rate": 3.3627139536955565e-06, + "loss": 0.6111, + "step": 6363 + }, + { + "epoch": 2.3492493231602265, + "grad_norm": 0.885164737701416, + "learning_rate": 3.3622574399013734e-06, + "loss": 0.6616, + "step": 6364 + }, + { + "epoch": 2.349618508491263, + "grad_norm": 0.8890930414199829, + "learning_rate": 3.361800893471017e-06, + "loss": 0.6079, + "step": 6365 + }, + { + "epoch": 2.3499876938222988, + "grad_norm": 0.8699433207511902, + "learning_rate": 3.36134431442177e-06, + "loss": 0.6402, + "step": 6366 + }, + { + "epoch": 2.350356879153335, + "grad_norm": 0.8776676654815674, + "learning_rate": 3.3608877027709117e-06, + "loss": 0.5985, + "step": 6367 + }, + { + "epoch": 2.350726064484371, + "grad_norm": 1.0655444860458374, + "learning_rate": 3.360431058535727e-06, + "loss": 0.6384, + "step": 6368 + }, + { + "epoch": 2.3510952498154074, + "grad_norm": 0.8856361508369446, + "learning_rate": 3.3599743817334964e-06, + "loss": 0.6038, + "step": 6369 + }, + { + "epoch": 2.3514644351464433, + "grad_norm": 0.8589296936988831, + "learning_rate": 3.3595176723815082e-06, + "loss": 0.6445, + "step": 6370 + }, + { + "epoch": 2.3518336204774797, + "grad_norm": 0.869274377822876, + "learning_rate": 3.3590609304970467e-06, + "loss": 0.6026, + "step": 6371 + }, + { + "epoch": 2.352202805808516, + "grad_norm": 0.9041165709495544, + "learning_rate": 3.3586041560974005e-06, + "loss": 0.6088, + "step": 6372 + }, + { + "epoch": 2.352571991139552, + "grad_norm": 0.872183620929718, + "learning_rate": 3.3581473491998572e-06, + "loss": 0.5902, + "step": 6373 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.8950784206390381, + "learning_rate": 3.357690509821708e-06, + "loss": 0.6517, + "step": 6374 + }, + { + "epoch": 2.3533103618016242, + "grad_norm": 0.9352660775184631, + "learning_rate": 3.357233637980244e-06, + "loss": 0.6587, + "step": 6375 + }, + { + "epoch": 2.3536795471326606, + "grad_norm": 0.9068261384963989, + "learning_rate": 3.3567767336927577e-06, + "loss": 0.6253, + "step": 6376 + }, + { + "epoch": 2.354048732463697, + "grad_norm": 0.8698081970214844, + "learning_rate": 3.3563197969765415e-06, + "loss": 0.6182, + "step": 6377 + }, + { + "epoch": 2.354417917794733, + "grad_norm": 0.8820944428443909, + "learning_rate": 3.355862827848892e-06, + "loss": 0.6241, + "step": 6378 + }, + { + "epoch": 2.354787103125769, + "grad_norm": 0.9108041524887085, + "learning_rate": 3.3554058263271032e-06, + "loss": 0.5874, + "step": 6379 + }, + { + "epoch": 2.355156288456805, + "grad_norm": 0.9089260101318359, + "learning_rate": 3.3549487924284757e-06, + "loss": 0.6404, + "step": 6380 + }, + { + "epoch": 2.3555254737878415, + "grad_norm": 0.8860456347465515, + "learning_rate": 3.3544917261703045e-06, + "loss": 0.6414, + "step": 6381 + }, + { + "epoch": 2.355894659118878, + "grad_norm": 0.9201451539993286, + "learning_rate": 3.354034627569893e-06, + "loss": 0.6545, + "step": 6382 + }, + { + "epoch": 2.3562638444499138, + "grad_norm": 0.841791570186615, + "learning_rate": 3.3535774966445388e-06, + "loss": 0.5478, + "step": 6383 + }, + { + "epoch": 2.35663302978095, + "grad_norm": 0.874345064163208, + "learning_rate": 3.353120333411547e-06, + "loss": 0.6461, + "step": 6384 + }, + { + "epoch": 2.357002215111986, + "grad_norm": 0.8625495433807373, + "learning_rate": 3.3526631378882183e-06, + "loss": 0.6376, + "step": 6385 + }, + { + "epoch": 2.3573714004430224, + "grad_norm": 0.9043352007865906, + "learning_rate": 3.35220591009186e-06, + "loss": 0.67, + "step": 6386 + }, + { + "epoch": 2.3577405857740588, + "grad_norm": 0.8766943216323853, + "learning_rate": 3.351748650039777e-06, + "loss": 0.6109, + "step": 6387 + }, + { + "epoch": 2.3581097711050947, + "grad_norm": 0.8899180293083191, + "learning_rate": 3.3512913577492763e-06, + "loss": 0.6344, + "step": 6388 + }, + { + "epoch": 2.358478956436131, + "grad_norm": 0.9236246943473816, + "learning_rate": 3.350834033237666e-06, + "loss": 0.6248, + "step": 6389 + }, + { + "epoch": 2.358848141767167, + "grad_norm": 0.8563353419303894, + "learning_rate": 3.3503766765222566e-06, + "loss": 0.6176, + "step": 6390 + }, + { + "epoch": 2.3592173270982033, + "grad_norm": 0.8466917276382446, + "learning_rate": 3.3499192876203577e-06, + "loss": 0.5923, + "step": 6391 + }, + { + "epoch": 2.3595865124292397, + "grad_norm": 0.8853041529655457, + "learning_rate": 3.3494618665492833e-06, + "loss": 0.5878, + "step": 6392 + }, + { + "epoch": 2.3599556977602756, + "grad_norm": 0.875528872013092, + "learning_rate": 3.349004413326344e-06, + "loss": 0.5984, + "step": 6393 + }, + { + "epoch": 2.360324883091312, + "grad_norm": 0.8980870842933655, + "learning_rate": 3.3485469279688565e-06, + "loss": 0.6197, + "step": 6394 + }, + { + "epoch": 2.360694068422348, + "grad_norm": 0.9091758728027344, + "learning_rate": 3.3480894104941346e-06, + "loss": 0.6691, + "step": 6395 + }, + { + "epoch": 2.361063253753384, + "grad_norm": 0.8966889977455139, + "learning_rate": 3.3476318609194973e-06, + "loss": 0.6187, + "step": 6396 + }, + { + "epoch": 2.36143243908442, + "grad_norm": 0.873165488243103, + "learning_rate": 3.3471742792622608e-06, + "loss": 0.6301, + "step": 6397 + }, + { + "epoch": 2.3618016244154565, + "grad_norm": 0.8999606370925903, + "learning_rate": 3.3467166655397453e-06, + "loss": 0.6151, + "step": 6398 + }, + { + "epoch": 2.362170809746493, + "grad_norm": 0.9092506766319275, + "learning_rate": 3.3462590197692712e-06, + "loss": 0.679, + "step": 6399 + }, + { + "epoch": 2.3625399950775288, + "grad_norm": 0.8985182642936707, + "learning_rate": 3.3458013419681607e-06, + "loss": 0.6115, + "step": 6400 + }, + { + "epoch": 2.362909180408565, + "grad_norm": 0.8679935932159424, + "learning_rate": 3.345343632153736e-06, + "loss": 0.6085, + "step": 6401 + }, + { + "epoch": 2.3632783657396015, + "grad_norm": 0.8930963277816772, + "learning_rate": 3.344885890343322e-06, + "loss": 0.6206, + "step": 6402 + }, + { + "epoch": 2.3636475510706374, + "grad_norm": 0.8877418637275696, + "learning_rate": 3.344428116554243e-06, + "loss": 0.6172, + "step": 6403 + }, + { + "epoch": 2.3640167364016738, + "grad_norm": 0.8850848078727722, + "learning_rate": 3.343970310803827e-06, + "loss": 0.6334, + "step": 6404 + }, + { + "epoch": 2.3643859217327097, + "grad_norm": 0.8826606273651123, + "learning_rate": 3.3435124731094003e-06, + "loss": 0.6111, + "step": 6405 + }, + { + "epoch": 2.364755107063746, + "grad_norm": 0.8824233412742615, + "learning_rate": 3.3430546034882937e-06, + "loss": 0.6402, + "step": 6406 + }, + { + "epoch": 2.365124292394782, + "grad_norm": 0.8878569602966309, + "learning_rate": 3.342596701957835e-06, + "loss": 0.6031, + "step": 6407 + }, + { + "epoch": 2.3654934777258183, + "grad_norm": 0.8891258239746094, + "learning_rate": 3.342138768535359e-06, + "loss": 0.6057, + "step": 6408 + }, + { + "epoch": 2.3658626630568547, + "grad_norm": 0.8711044192314148, + "learning_rate": 3.3416808032381955e-06, + "loss": 0.5928, + "step": 6409 + }, + { + "epoch": 2.3662318483878906, + "grad_norm": 0.9017554521560669, + "learning_rate": 3.3412228060836795e-06, + "loss": 0.622, + "step": 6410 + }, + { + "epoch": 2.366601033718927, + "grad_norm": 0.8699018955230713, + "learning_rate": 3.340764777089145e-06, + "loss": 0.6544, + "step": 6411 + }, + { + "epoch": 2.3669702190499633, + "grad_norm": 0.9148643612861633, + "learning_rate": 3.3403067162719296e-06, + "loss": 0.618, + "step": 6412 + }, + { + "epoch": 2.367339404380999, + "grad_norm": 0.9592782258987427, + "learning_rate": 3.3398486236493694e-06, + "loss": 0.6495, + "step": 6413 + }, + { + "epoch": 2.3677085897120356, + "grad_norm": 0.9230630397796631, + "learning_rate": 3.339390499238805e-06, + "loss": 0.6488, + "step": 6414 + }, + { + "epoch": 2.3680777750430715, + "grad_norm": 0.8855714201927185, + "learning_rate": 3.338932343057574e-06, + "loss": 0.6045, + "step": 6415 + }, + { + "epoch": 2.368446960374108, + "grad_norm": 0.8813384175300598, + "learning_rate": 3.33847415512302e-06, + "loss": 0.6463, + "step": 6416 + }, + { + "epoch": 2.3688161457051438, + "grad_norm": 0.8758666515350342, + "learning_rate": 3.338015935452482e-06, + "loss": 0.6245, + "step": 6417 + }, + { + "epoch": 2.36918533103618, + "grad_norm": 0.9103012084960938, + "learning_rate": 3.337557684063307e-06, + "loss": 0.6375, + "step": 6418 + }, + { + "epoch": 2.3695545163672165, + "grad_norm": 0.8979682326316833, + "learning_rate": 3.3370994009728374e-06, + "loss": 0.6283, + "step": 6419 + }, + { + "epoch": 2.3699237016982524, + "grad_norm": 0.8930819034576416, + "learning_rate": 3.33664108619842e-06, + "loss": 0.6198, + "step": 6420 + }, + { + "epoch": 2.3702928870292888, + "grad_norm": 0.8969647288322449, + "learning_rate": 3.336182739757401e-06, + "loss": 0.5906, + "step": 6421 + }, + { + "epoch": 2.370662072360325, + "grad_norm": 0.8891032934188843, + "learning_rate": 3.3357243616671293e-06, + "loss": 0.6057, + "step": 6422 + }, + { + "epoch": 2.371031257691361, + "grad_norm": 0.8816516399383545, + "learning_rate": 3.3352659519449537e-06, + "loss": 0.6218, + "step": 6423 + }, + { + "epoch": 2.3714004430223974, + "grad_norm": 0.8681266903877258, + "learning_rate": 3.3348075106082266e-06, + "loss": 0.5927, + "step": 6424 + }, + { + "epoch": 2.3717696283534333, + "grad_norm": 0.8963855504989624, + "learning_rate": 3.3343490376742983e-06, + "loss": 0.6116, + "step": 6425 + }, + { + "epoch": 2.3721388136844697, + "grad_norm": 0.8956697583198547, + "learning_rate": 3.333890533160522e-06, + "loss": 0.6255, + "step": 6426 + }, + { + "epoch": 2.3725079990155056, + "grad_norm": 0.9170622825622559, + "learning_rate": 3.3334319970842523e-06, + "loss": 0.6272, + "step": 6427 + }, + { + "epoch": 2.372877184346542, + "grad_norm": 1.020965576171875, + "learning_rate": 3.332973429462845e-06, + "loss": 0.6211, + "step": 6428 + }, + { + "epoch": 2.3732463696775783, + "grad_norm": 0.8699204325675964, + "learning_rate": 3.3325148303136555e-06, + "loss": 0.6419, + "step": 6429 + }, + { + "epoch": 2.373615555008614, + "grad_norm": 0.9055162072181702, + "learning_rate": 3.332056199654043e-06, + "loss": 0.6476, + "step": 6430 + }, + { + "epoch": 2.3739847403396506, + "grad_norm": 0.8626404404640198, + "learning_rate": 3.3315975375013654e-06, + "loss": 0.5933, + "step": 6431 + }, + { + "epoch": 2.374353925670687, + "grad_norm": 0.8913437724113464, + "learning_rate": 3.331138843872984e-06, + "loss": 0.6105, + "step": 6432 + }, + { + "epoch": 2.374723111001723, + "grad_norm": 0.8726436495780945, + "learning_rate": 3.330680118786259e-06, + "loss": 0.6434, + "step": 6433 + }, + { + "epoch": 2.375092296332759, + "grad_norm": 0.8895217776298523, + "learning_rate": 3.3302213622585544e-06, + "loss": 0.6091, + "step": 6434 + }, + { + "epoch": 2.375461481663795, + "grad_norm": 0.8720279335975647, + "learning_rate": 3.329762574307233e-06, + "loss": 0.6863, + "step": 6435 + }, + { + "epoch": 2.3758306669948315, + "grad_norm": 0.908521294593811, + "learning_rate": 3.3293037549496598e-06, + "loss": 0.623, + "step": 6436 + }, + { + "epoch": 2.3761998523258674, + "grad_norm": 0.8761116862297058, + "learning_rate": 3.328844904203201e-06, + "loss": 0.6015, + "step": 6437 + }, + { + "epoch": 2.3765690376569037, + "grad_norm": 0.8794270157814026, + "learning_rate": 3.3283860220852256e-06, + "loss": 0.5999, + "step": 6438 + }, + { + "epoch": 2.37693822298794, + "grad_norm": 0.8743786811828613, + "learning_rate": 3.3279271086130994e-06, + "loss": 0.6427, + "step": 6439 + }, + { + "epoch": 2.377307408318976, + "grad_norm": 0.8855113387107849, + "learning_rate": 3.327468163804194e-06, + "loss": 0.6083, + "step": 6440 + }, + { + "epoch": 2.3776765936500124, + "grad_norm": 0.8477702736854553, + "learning_rate": 3.3270091876758786e-06, + "loss": 0.5782, + "step": 6441 + }, + { + "epoch": 2.3780457789810483, + "grad_norm": 0.8926432132720947, + "learning_rate": 3.326550180245528e-06, + "loss": 0.5918, + "step": 6442 + }, + { + "epoch": 2.3784149643120847, + "grad_norm": 0.8844828605651855, + "learning_rate": 3.326091141530513e-06, + "loss": 0.5987, + "step": 6443 + }, + { + "epoch": 2.378784149643121, + "grad_norm": 0.8962540030479431, + "learning_rate": 3.32563207154821e-06, + "loss": 0.6024, + "step": 6444 + }, + { + "epoch": 2.379153334974157, + "grad_norm": 0.8820509314537048, + "learning_rate": 3.325172970315992e-06, + "loss": 0.6048, + "step": 6445 + }, + { + "epoch": 2.3795225203051933, + "grad_norm": 0.8636883497238159, + "learning_rate": 3.324713837851239e-06, + "loss": 0.6366, + "step": 6446 + }, + { + "epoch": 2.379891705636229, + "grad_norm": 0.9305062890052795, + "learning_rate": 3.3242546741713266e-06, + "loss": 0.6285, + "step": 6447 + }, + { + "epoch": 2.3802608909672656, + "grad_norm": 0.9081406593322754, + "learning_rate": 3.3237954792936355e-06, + "loss": 0.6145, + "step": 6448 + }, + { + "epoch": 2.380630076298302, + "grad_norm": 0.908954381942749, + "learning_rate": 3.323336253235545e-06, + "loss": 0.6526, + "step": 6449 + }, + { + "epoch": 2.380999261629338, + "grad_norm": 0.8315276503562927, + "learning_rate": 3.322876996014438e-06, + "loss": 0.5905, + "step": 6450 + }, + { + "epoch": 2.381368446960374, + "grad_norm": 0.9024296402931213, + "learning_rate": 3.3224177076476953e-06, + "loss": 0.649, + "step": 6451 + }, + { + "epoch": 2.38173763229141, + "grad_norm": 0.8760672807693481, + "learning_rate": 3.321958388152703e-06, + "loss": 0.6177, + "step": 6452 + }, + { + "epoch": 2.3821068176224465, + "grad_norm": 0.8960379958152771, + "learning_rate": 3.321499037546845e-06, + "loss": 0.6444, + "step": 6453 + }, + { + "epoch": 2.382476002953483, + "grad_norm": 0.8897546529769897, + "learning_rate": 3.3210396558475073e-06, + "loss": 0.6276, + "step": 6454 + }, + { + "epoch": 2.3828451882845187, + "grad_norm": 0.9274020195007324, + "learning_rate": 3.320580243072077e-06, + "loss": 0.6294, + "step": 6455 + }, + { + "epoch": 2.383214373615555, + "grad_norm": 0.886806845664978, + "learning_rate": 3.3201207992379446e-06, + "loss": 0.6446, + "step": 6456 + }, + { + "epoch": 2.383583558946591, + "grad_norm": 0.8872087001800537, + "learning_rate": 3.319661324362498e-06, + "loss": 0.5831, + "step": 6457 + }, + { + "epoch": 2.3839527442776274, + "grad_norm": 0.8909026384353638, + "learning_rate": 3.3192018184631305e-06, + "loss": 0.6195, + "step": 6458 + }, + { + "epoch": 2.3843219296086637, + "grad_norm": 0.8917391896247864, + "learning_rate": 3.3187422815572313e-06, + "loss": 0.6268, + "step": 6459 + }, + { + "epoch": 2.3846911149396997, + "grad_norm": 0.8867877721786499, + "learning_rate": 3.3182827136621946e-06, + "loss": 0.5887, + "step": 6460 + }, + { + "epoch": 2.385060300270736, + "grad_norm": 0.9109401106834412, + "learning_rate": 3.3178231147954165e-06, + "loss": 0.6309, + "step": 6461 + }, + { + "epoch": 2.385429485601772, + "grad_norm": 0.9259582161903381, + "learning_rate": 3.317363484974292e-06, + "loss": 0.6622, + "step": 6462 + }, + { + "epoch": 2.3857986709328083, + "grad_norm": 0.8898127675056458, + "learning_rate": 3.3169038242162164e-06, + "loss": 0.5927, + "step": 6463 + }, + { + "epoch": 2.386167856263844, + "grad_norm": 0.8807457089424133, + "learning_rate": 3.3164441325385894e-06, + "loss": 0.5675, + "step": 6464 + }, + { + "epoch": 2.3865370415948806, + "grad_norm": 0.8943644165992737, + "learning_rate": 3.3159844099588097e-06, + "loss": 0.655, + "step": 6465 + }, + { + "epoch": 2.386906226925917, + "grad_norm": 0.89766526222229, + "learning_rate": 3.315524656494278e-06, + "loss": 0.5982, + "step": 6466 + }, + { + "epoch": 2.387275412256953, + "grad_norm": 0.8820800185203552, + "learning_rate": 3.3150648721623947e-06, + "loss": 0.626, + "step": 6467 + }, + { + "epoch": 2.387644597587989, + "grad_norm": 0.9193159341812134, + "learning_rate": 3.3146050569805644e-06, + "loss": 0.646, + "step": 6468 + }, + { + "epoch": 2.3880137829190256, + "grad_norm": 0.8522205352783203, + "learning_rate": 3.3141452109661877e-06, + "loss": 0.5777, + "step": 6469 + }, + { + "epoch": 2.3883829682500615, + "grad_norm": 0.8985540866851807, + "learning_rate": 3.3136853341366735e-06, + "loss": 0.6315, + "step": 6470 + }, + { + "epoch": 2.388752153581098, + "grad_norm": 0.8889912962913513, + "learning_rate": 3.3132254265094255e-06, + "loss": 0.6319, + "step": 6471 + }, + { + "epoch": 2.3891213389121337, + "grad_norm": 0.8760136961936951, + "learning_rate": 3.3127654881018524e-06, + "loss": 0.58, + "step": 6472 + }, + { + "epoch": 2.38949052424317, + "grad_norm": 0.8871849179267883, + "learning_rate": 3.3123055189313614e-06, + "loss": 0.6047, + "step": 6473 + }, + { + "epoch": 2.389859709574206, + "grad_norm": 0.8809434175491333, + "learning_rate": 3.311845519015363e-06, + "loss": 0.6313, + "step": 6474 + }, + { + "epoch": 2.3902288949052424, + "grad_norm": 0.9147416353225708, + "learning_rate": 3.311385488371268e-06, + "loss": 0.6257, + "step": 6475 + }, + { + "epoch": 2.3905980802362787, + "grad_norm": 0.9294054508209229, + "learning_rate": 3.3109254270164887e-06, + "loss": 0.5943, + "step": 6476 + }, + { + "epoch": 2.3909672655673146, + "grad_norm": 0.8793157935142517, + "learning_rate": 3.3104653349684375e-06, + "loss": 0.5759, + "step": 6477 + }, + { + "epoch": 2.391336450898351, + "grad_norm": 0.880693793296814, + "learning_rate": 3.310005212244529e-06, + "loss": 0.638, + "step": 6478 + }, + { + "epoch": 2.3917056362293874, + "grad_norm": 0.8650186061859131, + "learning_rate": 3.309545058862179e-06, + "loss": 0.5978, + "step": 6479 + }, + { + "epoch": 2.3920748215604233, + "grad_norm": 0.8870683312416077, + "learning_rate": 3.3090848748388042e-06, + "loss": 0.6075, + "step": 6480 + }, + { + "epoch": 2.3924440068914596, + "grad_norm": 0.8827109336853027, + "learning_rate": 3.308624660191822e-06, + "loss": 0.6294, + "step": 6481 + }, + { + "epoch": 2.3928131922224956, + "grad_norm": 0.8798841834068298, + "learning_rate": 3.308164414938652e-06, + "loss": 0.5581, + "step": 6482 + }, + { + "epoch": 2.393182377553532, + "grad_norm": 0.8784766793251038, + "learning_rate": 3.3077041390967134e-06, + "loss": 0.6023, + "step": 6483 + }, + { + "epoch": 2.393551562884568, + "grad_norm": 0.8768050074577332, + "learning_rate": 3.307243832683429e-06, + "loss": 0.6134, + "step": 6484 + }, + { + "epoch": 2.393920748215604, + "grad_norm": 0.8835223913192749, + "learning_rate": 3.306783495716219e-06, + "loss": 0.577, + "step": 6485 + }, + { + "epoch": 2.3942899335466405, + "grad_norm": 0.8789826035499573, + "learning_rate": 3.3063231282125096e-06, + "loss": 0.6035, + "step": 6486 + }, + { + "epoch": 2.3946591188776765, + "grad_norm": 0.9175293445587158, + "learning_rate": 3.305862730189724e-06, + "loss": 0.6459, + "step": 6487 + }, + { + "epoch": 2.395028304208713, + "grad_norm": 0.8951887488365173, + "learning_rate": 3.3054023016652876e-06, + "loss": 0.6302, + "step": 6488 + }, + { + "epoch": 2.395397489539749, + "grad_norm": 0.8854497075080872, + "learning_rate": 3.3049418426566283e-06, + "loss": 0.5861, + "step": 6489 + }, + { + "epoch": 2.395766674870785, + "grad_norm": 0.903958261013031, + "learning_rate": 3.304481353181175e-06, + "loss": 0.626, + "step": 6490 + }, + { + "epoch": 2.3961358602018215, + "grad_norm": 0.9030171036720276, + "learning_rate": 3.3040208332563563e-06, + "loss": 0.6211, + "step": 6491 + }, + { + "epoch": 2.3965050455328574, + "grad_norm": 0.9029189944267273, + "learning_rate": 3.303560282899603e-06, + "loss": 0.6343, + "step": 6492 + }, + { + "epoch": 2.3968742308638937, + "grad_norm": 0.9018768072128296, + "learning_rate": 3.3030997021283456e-06, + "loss": 0.6265, + "step": 6493 + }, + { + "epoch": 2.3972434161949296, + "grad_norm": 0.8870609402656555, + "learning_rate": 3.302639090960019e-06, + "loss": 0.6144, + "step": 6494 + }, + { + "epoch": 2.397612601525966, + "grad_norm": 0.8803600072860718, + "learning_rate": 3.3021784494120557e-06, + "loss": 0.6237, + "step": 6495 + }, + { + "epoch": 2.3979817868570024, + "grad_norm": 0.8544716238975525, + "learning_rate": 3.3017177775018917e-06, + "loss": 0.5684, + "step": 6496 + }, + { + "epoch": 2.3983509721880383, + "grad_norm": 0.8744337558746338, + "learning_rate": 3.301257075246962e-06, + "loss": 0.6235, + "step": 6497 + }, + { + "epoch": 2.3987201575190746, + "grad_norm": 0.897853434085846, + "learning_rate": 3.3007963426647047e-06, + "loss": 0.6239, + "step": 6498 + }, + { + "epoch": 2.399089342850111, + "grad_norm": 0.8761389851570129, + "learning_rate": 3.3003355797725597e-06, + "loss": 0.6114, + "step": 6499 + }, + { + "epoch": 2.399458528181147, + "grad_norm": 0.8760924935340881, + "learning_rate": 3.2998747865879656e-06, + "loss": 0.6186, + "step": 6500 + }, + { + "epoch": 2.3998277135121833, + "grad_norm": 0.9052746891975403, + "learning_rate": 3.2994139631283624e-06, + "loss": 0.623, + "step": 6501 + }, + { + "epoch": 2.400196898843219, + "grad_norm": 0.8953381776809692, + "learning_rate": 3.2989531094111927e-06, + "loss": 0.6085, + "step": 6502 + }, + { + "epoch": 2.4005660841742555, + "grad_norm": 0.8882981538772583, + "learning_rate": 3.2984922254539007e-06, + "loss": 0.6045, + "step": 6503 + }, + { + "epoch": 2.4009352695052915, + "grad_norm": 0.9219051003456116, + "learning_rate": 3.2980313112739302e-06, + "loss": 0.6702, + "step": 6504 + }, + { + "epoch": 2.401304454836328, + "grad_norm": 0.8827004432678223, + "learning_rate": 3.297570366888726e-06, + "loss": 0.6078, + "step": 6505 + }, + { + "epoch": 2.401673640167364, + "grad_norm": 0.8790473937988281, + "learning_rate": 3.2971093923157353e-06, + "loss": 0.607, + "step": 6506 + }, + { + "epoch": 2.4020428254984, + "grad_norm": 0.9405440092086792, + "learning_rate": 3.296648387572405e-06, + "loss": 0.6368, + "step": 6507 + }, + { + "epoch": 2.4024120108294365, + "grad_norm": 0.8994283676147461, + "learning_rate": 3.2961873526761857e-06, + "loss": 0.6052, + "step": 6508 + }, + { + "epoch": 2.4027811961604724, + "grad_norm": 0.9271222352981567, + "learning_rate": 3.295726287644526e-06, + "loss": 0.6717, + "step": 6509 + }, + { + "epoch": 2.4031503814915087, + "grad_norm": 0.8835850358009338, + "learning_rate": 3.2952651924948777e-06, + "loss": 0.6413, + "step": 6510 + }, + { + "epoch": 2.403519566822545, + "grad_norm": 0.9064237475395203, + "learning_rate": 3.2948040672446924e-06, + "loss": 0.6633, + "step": 6511 + }, + { + "epoch": 2.403888752153581, + "grad_norm": 0.9140031933784485, + "learning_rate": 3.2943429119114234e-06, + "loss": 0.6639, + "step": 6512 + }, + { + "epoch": 2.4042579374846174, + "grad_norm": 0.894664466381073, + "learning_rate": 3.293881726512527e-06, + "loss": 0.6126, + "step": 6513 + }, + { + "epoch": 2.4046271228156533, + "grad_norm": 0.8934621810913086, + "learning_rate": 3.2934205110654575e-06, + "loss": 0.6158, + "step": 6514 + }, + { + "epoch": 2.4049963081466896, + "grad_norm": 0.8741678595542908, + "learning_rate": 3.292959265587672e-06, + "loss": 0.6276, + "step": 6515 + }, + { + "epoch": 2.405365493477726, + "grad_norm": 0.9190518856048584, + "learning_rate": 3.2924979900966286e-06, + "loss": 0.6183, + "step": 6516 + }, + { + "epoch": 2.405734678808762, + "grad_norm": 0.9042533040046692, + "learning_rate": 3.2920366846097854e-06, + "loss": 0.6178, + "step": 6517 + }, + { + "epoch": 2.4061038641397983, + "grad_norm": 0.911628007888794, + "learning_rate": 3.2915753491446057e-06, + "loss": 0.6246, + "step": 6518 + }, + { + "epoch": 2.406473049470834, + "grad_norm": 0.8643136620521545, + "learning_rate": 3.2911139837185475e-06, + "loss": 0.6291, + "step": 6519 + }, + { + "epoch": 2.4068422348018705, + "grad_norm": 0.8779479265213013, + "learning_rate": 3.2906525883490746e-06, + "loss": 0.6495, + "step": 6520 + }, + { + "epoch": 2.407211420132907, + "grad_norm": 0.9024735689163208, + "learning_rate": 3.2901911630536516e-06, + "loss": 0.6137, + "step": 6521 + }, + { + "epoch": 2.407580605463943, + "grad_norm": 0.8981119394302368, + "learning_rate": 3.2897297078497412e-06, + "loss": 0.6235, + "step": 6522 + }, + { + "epoch": 2.407949790794979, + "grad_norm": 0.9064521193504333, + "learning_rate": 3.2892682227548115e-06, + "loss": 0.6021, + "step": 6523 + }, + { + "epoch": 2.408318976126015, + "grad_norm": 0.9344003796577454, + "learning_rate": 3.288806707786329e-06, + "loss": 0.6363, + "step": 6524 + }, + { + "epoch": 2.4086881614570514, + "grad_norm": 0.8823830485343933, + "learning_rate": 3.2883451629617615e-06, + "loss": 0.6274, + "step": 6525 + }, + { + "epoch": 2.409057346788088, + "grad_norm": 0.9151976704597473, + "learning_rate": 3.287883588298577e-06, + "loss": 0.6224, + "step": 6526 + }, + { + "epoch": 2.4094265321191237, + "grad_norm": 0.9120372533798218, + "learning_rate": 3.2874219838142484e-06, + "loss": 0.6095, + "step": 6527 + }, + { + "epoch": 2.40979571745016, + "grad_norm": 0.8691409826278687, + "learning_rate": 3.286960349526247e-06, + "loss": 0.6506, + "step": 6528 + }, + { + "epoch": 2.410164902781196, + "grad_norm": 0.9042947292327881, + "learning_rate": 3.2864986854520433e-06, + "loss": 0.681, + "step": 6529 + }, + { + "epoch": 2.4105340881122324, + "grad_norm": 0.8622151613235474, + "learning_rate": 3.286036991609114e-06, + "loss": 0.6174, + "step": 6530 + }, + { + "epoch": 2.4109032734432687, + "grad_norm": 0.9274536371231079, + "learning_rate": 3.285575268014931e-06, + "loss": 0.6642, + "step": 6531 + }, + { + "epoch": 2.4112724587743046, + "grad_norm": 0.8846158385276794, + "learning_rate": 3.2851135146869727e-06, + "loss": 0.625, + "step": 6532 + }, + { + "epoch": 2.411641644105341, + "grad_norm": 0.883685827255249, + "learning_rate": 3.2846517316427163e-06, + "loss": 0.6367, + "step": 6533 + }, + { + "epoch": 2.412010829436377, + "grad_norm": 0.9229357242584229, + "learning_rate": 3.284189918899639e-06, + "loss": 0.611, + "step": 6534 + }, + { + "epoch": 2.4123800147674133, + "grad_norm": 0.9244977235794067, + "learning_rate": 3.2837280764752204e-06, + "loss": 0.6286, + "step": 6535 + }, + { + "epoch": 2.4127492000984496, + "grad_norm": 0.8726721405982971, + "learning_rate": 3.283266204386942e-06, + "loss": 0.6246, + "step": 6536 + }, + { + "epoch": 2.4131183854294855, + "grad_norm": 0.9041004180908203, + "learning_rate": 3.2828043026522847e-06, + "loss": 0.6522, + "step": 6537 + }, + { + "epoch": 2.413487570760522, + "grad_norm": 0.8801578879356384, + "learning_rate": 3.282342371288732e-06, + "loss": 0.5967, + "step": 6538 + }, + { + "epoch": 2.413856756091558, + "grad_norm": 0.8883997797966003, + "learning_rate": 3.281880410313767e-06, + "loss": 0.6317, + "step": 6539 + }, + { + "epoch": 2.414225941422594, + "grad_norm": 0.8998392820358276, + "learning_rate": 3.281418419744875e-06, + "loss": 0.6357, + "step": 6540 + }, + { + "epoch": 2.41459512675363, + "grad_norm": 0.8976721167564392, + "learning_rate": 3.280956399599543e-06, + "loss": 0.5825, + "step": 6541 + }, + { + "epoch": 2.4149643120846664, + "grad_norm": 0.8992498517036438, + "learning_rate": 3.280494349895258e-06, + "loss": 0.6388, + "step": 6542 + }, + { + "epoch": 2.415333497415703, + "grad_norm": 0.9409188628196716, + "learning_rate": 3.2800322706495073e-06, + "loss": 0.6427, + "step": 6543 + }, + { + "epoch": 2.4157026827467387, + "grad_norm": 0.9115458130836487, + "learning_rate": 3.279570161879782e-06, + "loss": 0.6039, + "step": 6544 + }, + { + "epoch": 2.416071868077775, + "grad_norm": 0.9191989302635193, + "learning_rate": 3.279108023603572e-06, + "loss": 0.6344, + "step": 6545 + }, + { + "epoch": 2.4164410534088114, + "grad_norm": 0.8896133899688721, + "learning_rate": 3.278645855838369e-06, + "loss": 0.6261, + "step": 6546 + }, + { + "epoch": 2.4168102387398473, + "grad_norm": 0.9025856852531433, + "learning_rate": 3.2781836586016657e-06, + "loss": 0.6094, + "step": 6547 + }, + { + "epoch": 2.4171794240708837, + "grad_norm": 0.9009783864021301, + "learning_rate": 3.2777214319109576e-06, + "loss": 0.656, + "step": 6548 + }, + { + "epoch": 2.4175486094019196, + "grad_norm": 0.8692270517349243, + "learning_rate": 3.277259175783738e-06, + "loss": 0.5849, + "step": 6549 + }, + { + "epoch": 2.417917794732956, + "grad_norm": 0.8728926181793213, + "learning_rate": 3.2767968902375034e-06, + "loss": 0.5894, + "step": 6550 + }, + { + "epoch": 2.418286980063992, + "grad_norm": 0.8845539689064026, + "learning_rate": 3.2763345752897535e-06, + "loss": 0.6236, + "step": 6551 + }, + { + "epoch": 2.4186561653950283, + "grad_norm": 0.8901984095573425, + "learning_rate": 3.2758722309579834e-06, + "loss": 0.6258, + "step": 6552 + }, + { + "epoch": 2.4190253507260646, + "grad_norm": 0.9099188446998596, + "learning_rate": 3.2754098572596945e-06, + "loss": 0.6056, + "step": 6553 + }, + { + "epoch": 2.4193945360571005, + "grad_norm": 0.9318600296974182, + "learning_rate": 3.2749474542123873e-06, + "loss": 0.6333, + "step": 6554 + }, + { + "epoch": 2.419763721388137, + "grad_norm": 0.9109065532684326, + "learning_rate": 3.2744850218335638e-06, + "loss": 0.605, + "step": 6555 + }, + { + "epoch": 2.4201329067191732, + "grad_norm": 0.9082841873168945, + "learning_rate": 3.274022560140726e-06, + "loss": 0.5962, + "step": 6556 + }, + { + "epoch": 2.420502092050209, + "grad_norm": 0.8787094354629517, + "learning_rate": 3.273560069151379e-06, + "loss": 0.6078, + "step": 6557 + }, + { + "epoch": 2.4208712773812455, + "grad_norm": 0.8968225121498108, + "learning_rate": 3.273097548883028e-06, + "loss": 0.6146, + "step": 6558 + }, + { + "epoch": 2.4212404627122814, + "grad_norm": 0.9018953442573547, + "learning_rate": 3.2726349993531782e-06, + "loss": 0.5938, + "step": 6559 + }, + { + "epoch": 2.421609648043318, + "grad_norm": 0.8900550007820129, + "learning_rate": 3.272172420579337e-06, + "loss": 0.6437, + "step": 6560 + }, + { + "epoch": 2.4219788333743537, + "grad_norm": 0.8907724022865295, + "learning_rate": 3.2717098125790147e-06, + "loss": 0.6141, + "step": 6561 + }, + { + "epoch": 2.42234801870539, + "grad_norm": 0.8702531456947327, + "learning_rate": 3.2712471753697183e-06, + "loss": 0.6023, + "step": 6562 + }, + { + "epoch": 2.4227172040364264, + "grad_norm": 0.8649858832359314, + "learning_rate": 3.2707845089689606e-06, + "loss": 0.6035, + "step": 6563 + }, + { + "epoch": 2.4230863893674623, + "grad_norm": 0.8816810250282288, + "learning_rate": 3.2703218133942516e-06, + "loss": 0.6313, + "step": 6564 + }, + { + "epoch": 2.4234555746984987, + "grad_norm": 0.8708885908126831, + "learning_rate": 3.2698590886631066e-06, + "loss": 0.6028, + "step": 6565 + }, + { + "epoch": 2.423824760029535, + "grad_norm": 0.8675017356872559, + "learning_rate": 3.2693963347930367e-06, + "loss": 0.6036, + "step": 6566 + }, + { + "epoch": 2.424193945360571, + "grad_norm": 0.8589776754379272, + "learning_rate": 3.2689335518015587e-06, + "loss": 0.6344, + "step": 6567 + }, + { + "epoch": 2.4245631306916073, + "grad_norm": 0.8868077993392944, + "learning_rate": 3.2684707397061887e-06, + "loss": 0.6276, + "step": 6568 + }, + { + "epoch": 2.4249323160226433, + "grad_norm": 0.897175669670105, + "learning_rate": 3.2680078985244434e-06, + "loss": 0.5916, + "step": 6569 + }, + { + "epoch": 2.4253015013536796, + "grad_norm": 0.895261824131012, + "learning_rate": 3.267545028273842e-06, + "loss": 0.5836, + "step": 6570 + }, + { + "epoch": 2.4256706866847155, + "grad_norm": 0.882234513759613, + "learning_rate": 3.2670821289719034e-06, + "loss": 0.6075, + "step": 6571 + }, + { + "epoch": 2.426039872015752, + "grad_norm": 0.9079399108886719, + "learning_rate": 3.2666192006361475e-06, + "loss": 0.6604, + "step": 6572 + }, + { + "epoch": 2.4264090573467882, + "grad_norm": 0.9348976016044617, + "learning_rate": 3.2661562432840978e-06, + "loss": 0.6238, + "step": 6573 + }, + { + "epoch": 2.426778242677824, + "grad_norm": 0.9037050604820251, + "learning_rate": 3.2656932569332757e-06, + "loss": 0.6078, + "step": 6574 + }, + { + "epoch": 2.4271474280088605, + "grad_norm": 0.8783547878265381, + "learning_rate": 3.2652302416012053e-06, + "loss": 0.6193, + "step": 6575 + }, + { + "epoch": 2.4275166133398964, + "grad_norm": 0.91499924659729, + "learning_rate": 3.2647671973054117e-06, + "loss": 0.6147, + "step": 6576 + }, + { + "epoch": 2.427885798670933, + "grad_norm": 0.9023700952529907, + "learning_rate": 3.264304124063421e-06, + "loss": 0.6302, + "step": 6577 + }, + { + "epoch": 2.428254984001969, + "grad_norm": 0.9167583584785461, + "learning_rate": 3.263841021892761e-06, + "loss": 0.6527, + "step": 6578 + }, + { + "epoch": 2.428624169333005, + "grad_norm": 0.9320788979530334, + "learning_rate": 3.263377890810959e-06, + "loss": 0.6121, + "step": 6579 + }, + { + "epoch": 2.4289933546640414, + "grad_norm": 0.8515124320983887, + "learning_rate": 3.262914730835545e-06, + "loss": 0.6253, + "step": 6580 + }, + { + "epoch": 2.4293625399950773, + "grad_norm": 0.8823196291923523, + "learning_rate": 3.262451541984049e-06, + "loss": 0.625, + "step": 6581 + }, + { + "epoch": 2.4297317253261137, + "grad_norm": 0.8731393814086914, + "learning_rate": 3.2619883242740036e-06, + "loss": 0.6303, + "step": 6582 + }, + { + "epoch": 2.43010091065715, + "grad_norm": 0.8978605270385742, + "learning_rate": 3.2615250777229395e-06, + "loss": 0.6311, + "step": 6583 + }, + { + "epoch": 2.430470095988186, + "grad_norm": 0.918386697769165, + "learning_rate": 3.261061802348392e-06, + "loss": 0.6011, + "step": 6584 + }, + { + "epoch": 2.4308392813192223, + "grad_norm": 0.8470896482467651, + "learning_rate": 3.2605984981678955e-06, + "loss": 0.6184, + "step": 6585 + }, + { + "epoch": 2.4312084666502582, + "grad_norm": 0.8899707794189453, + "learning_rate": 3.2601351651989853e-06, + "loss": 0.6247, + "step": 6586 + }, + { + "epoch": 2.4315776519812946, + "grad_norm": 0.8970438838005066, + "learning_rate": 3.2596718034591996e-06, + "loss": 0.6431, + "step": 6587 + }, + { + "epoch": 2.431946837312331, + "grad_norm": 0.8892400860786438, + "learning_rate": 3.2592084129660755e-06, + "loss": 0.5699, + "step": 6588 + }, + { + "epoch": 2.432316022643367, + "grad_norm": 0.8903526067733765, + "learning_rate": 3.2587449937371536e-06, + "loss": 0.6205, + "step": 6589 + }, + { + "epoch": 2.4326852079744032, + "grad_norm": 0.8658482432365417, + "learning_rate": 3.258281545789972e-06, + "loss": 0.6461, + "step": 6590 + }, + { + "epoch": 2.433054393305439, + "grad_norm": 0.9259894490242004, + "learning_rate": 3.2578180691420734e-06, + "loss": 0.6259, + "step": 6591 + }, + { + "epoch": 2.4334235786364755, + "grad_norm": 0.9025768041610718, + "learning_rate": 3.2573545638110006e-06, + "loss": 0.6213, + "step": 6592 + }, + { + "epoch": 2.433792763967512, + "grad_norm": 0.8797717094421387, + "learning_rate": 3.2568910298142965e-06, + "loss": 0.6508, + "step": 6593 + }, + { + "epoch": 2.434161949298548, + "grad_norm": 0.8745200037956238, + "learning_rate": 3.256427467169505e-06, + "loss": 0.6464, + "step": 6594 + }, + { + "epoch": 2.434531134629584, + "grad_norm": 0.8925659656524658, + "learning_rate": 3.2559638758941735e-06, + "loss": 0.6186, + "step": 6595 + }, + { + "epoch": 2.43490031996062, + "grad_norm": 0.8711280226707458, + "learning_rate": 3.255500256005848e-06, + "loss": 0.6069, + "step": 6596 + }, + { + "epoch": 2.4352695052916564, + "grad_norm": 0.8804523944854736, + "learning_rate": 3.255036607522075e-06, + "loss": 0.6061, + "step": 6597 + }, + { + "epoch": 2.435638690622693, + "grad_norm": 0.8944711685180664, + "learning_rate": 3.2545729304604056e-06, + "loss": 0.6055, + "step": 6598 + }, + { + "epoch": 2.4360078759537287, + "grad_norm": 0.9016622304916382, + "learning_rate": 3.2541092248383893e-06, + "loss": 0.6585, + "step": 6599 + }, + { + "epoch": 2.436377061284765, + "grad_norm": 0.8732638955116272, + "learning_rate": 3.253645490673576e-06, + "loss": 0.6133, + "step": 6600 + }, + { + "epoch": 2.436746246615801, + "grad_norm": 0.905769944190979, + "learning_rate": 3.2531817279835187e-06, + "loss": 0.5668, + "step": 6601 + }, + { + "epoch": 2.4371154319468373, + "grad_norm": 0.8700900673866272, + "learning_rate": 3.2527179367857713e-06, + "loss": 0.6024, + "step": 6602 + }, + { + "epoch": 2.4374846172778737, + "grad_norm": 0.893319845199585, + "learning_rate": 3.252254117097888e-06, + "loss": 0.6554, + "step": 6603 + }, + { + "epoch": 2.4378538026089096, + "grad_norm": 0.8899886012077332, + "learning_rate": 3.2517902689374226e-06, + "loss": 0.6306, + "step": 6604 + }, + { + "epoch": 2.438222987939946, + "grad_norm": 0.8926476240158081, + "learning_rate": 3.251326392321934e-06, + "loss": 0.6605, + "step": 6605 + }, + { + "epoch": 2.438592173270982, + "grad_norm": 0.8732485771179199, + "learning_rate": 3.2508624872689775e-06, + "loss": 0.5848, + "step": 6606 + }, + { + "epoch": 2.4389613586020182, + "grad_norm": 0.9070883393287659, + "learning_rate": 3.250398553796113e-06, + "loss": 0.635, + "step": 6607 + }, + { + "epoch": 2.439330543933054, + "grad_norm": 0.8809328675270081, + "learning_rate": 3.2499345919209004e-06, + "loss": 0.6121, + "step": 6608 + }, + { + "epoch": 2.4396997292640905, + "grad_norm": 0.9075734615325928, + "learning_rate": 3.2494706016609e-06, + "loss": 0.6173, + "step": 6609 + }, + { + "epoch": 2.440068914595127, + "grad_norm": 0.8899075984954834, + "learning_rate": 3.2490065830336742e-06, + "loss": 0.6318, + "step": 6610 + }, + { + "epoch": 2.440438099926163, + "grad_norm": 0.8936514258384705, + "learning_rate": 3.248542536056785e-06, + "loss": 0.6021, + "step": 6611 + }, + { + "epoch": 2.440807285257199, + "grad_norm": 0.8905037045478821, + "learning_rate": 3.248078460747797e-06, + "loss": 0.6372, + "step": 6612 + }, + { + "epoch": 2.4411764705882355, + "grad_norm": 0.9336492419242859, + "learning_rate": 3.247614357124277e-06, + "loss": 0.6301, + "step": 6613 + }, + { + "epoch": 2.4415456559192714, + "grad_norm": 0.8790932893753052, + "learning_rate": 3.247150225203788e-06, + "loss": 0.5845, + "step": 6614 + }, + { + "epoch": 2.4419148412503078, + "grad_norm": 0.8862748146057129, + "learning_rate": 3.246686065003899e-06, + "loss": 0.6078, + "step": 6615 + }, + { + "epoch": 2.4422840265813437, + "grad_norm": 0.8866193890571594, + "learning_rate": 3.246221876542178e-06, + "loss": 0.6147, + "step": 6616 + }, + { + "epoch": 2.44265321191238, + "grad_norm": 0.8959043025970459, + "learning_rate": 3.2457576598361945e-06, + "loss": 0.644, + "step": 6617 + }, + { + "epoch": 2.443022397243416, + "grad_norm": 0.9016184210777283, + "learning_rate": 3.245293414903519e-06, + "loss": 0.6392, + "step": 6618 + }, + { + "epoch": 2.4433915825744523, + "grad_norm": 0.8923618793487549, + "learning_rate": 3.244829141761724e-06, + "loss": 0.6073, + "step": 6619 + }, + { + "epoch": 2.4437607679054887, + "grad_norm": 0.9090900421142578, + "learning_rate": 3.2443648404283795e-06, + "loss": 0.6048, + "step": 6620 + }, + { + "epoch": 2.4441299532365246, + "grad_norm": 0.8769664764404297, + "learning_rate": 3.2439005109210614e-06, + "loss": 0.636, + "step": 6621 + }, + { + "epoch": 2.444499138567561, + "grad_norm": 0.8924333453178406, + "learning_rate": 3.2434361532573433e-06, + "loss": 0.6035, + "step": 6622 + }, + { + "epoch": 2.4448683238985973, + "grad_norm": 0.8730320334434509, + "learning_rate": 3.2429717674548024e-06, + "loss": 0.6161, + "step": 6623 + }, + { + "epoch": 2.4452375092296332, + "grad_norm": 0.9084175229072571, + "learning_rate": 3.2425073535310136e-06, + "loss": 0.6139, + "step": 6624 + }, + { + "epoch": 2.4456066945606696, + "grad_norm": 0.9035570621490479, + "learning_rate": 3.242042911503557e-06, + "loss": 0.6347, + "step": 6625 + }, + { + "epoch": 2.4459758798917055, + "grad_norm": 0.8734599351882935, + "learning_rate": 3.2415784413900088e-06, + "loss": 0.6079, + "step": 6626 + }, + { + "epoch": 2.446345065222742, + "grad_norm": 0.8977445363998413, + "learning_rate": 3.2411139432079517e-06, + "loss": 0.5796, + "step": 6627 + }, + { + "epoch": 2.446714250553778, + "grad_norm": 0.8780228495597839, + "learning_rate": 3.2406494169749655e-06, + "loss": 0.5874, + "step": 6628 + }, + { + "epoch": 2.447083435884814, + "grad_norm": 0.8812336325645447, + "learning_rate": 3.240184862708633e-06, + "loss": 0.635, + "step": 6629 + }, + { + "epoch": 2.4474526212158505, + "grad_norm": 0.9338408708572388, + "learning_rate": 3.239720280426536e-06, + "loss": 0.6188, + "step": 6630 + }, + { + "epoch": 2.4478218065468864, + "grad_norm": 0.9280979633331299, + "learning_rate": 3.2392556701462605e-06, + "loss": 0.6255, + "step": 6631 + }, + { + "epoch": 2.4481909918779228, + "grad_norm": 0.9264072179794312, + "learning_rate": 3.238791031885391e-06, + "loss": 0.6299, + "step": 6632 + }, + { + "epoch": 2.448560177208959, + "grad_norm": 0.8954188227653503, + "learning_rate": 3.2383263656615147e-06, + "loss": 0.6391, + "step": 6633 + }, + { + "epoch": 2.448929362539995, + "grad_norm": 0.8726531267166138, + "learning_rate": 3.2378616714922184e-06, + "loss": 0.6162, + "step": 6634 + }, + { + "epoch": 2.4492985478710314, + "grad_norm": 0.8964354991912842, + "learning_rate": 3.2373969493950896e-06, + "loss": 0.6467, + "step": 6635 + }, + { + "epoch": 2.4496677332020673, + "grad_norm": 0.8645336627960205, + "learning_rate": 3.23693219938772e-06, + "loss": 0.6399, + "step": 6636 + }, + { + "epoch": 2.4500369185331037, + "grad_norm": 0.8976300358772278, + "learning_rate": 3.2364674214876997e-06, + "loss": 0.6705, + "step": 6637 + }, + { + "epoch": 2.4504061038641396, + "grad_norm": 0.9061161279678345, + "learning_rate": 3.236002615712619e-06, + "loss": 0.6209, + "step": 6638 + }, + { + "epoch": 2.450775289195176, + "grad_norm": 0.8723763227462769, + "learning_rate": 3.235537782080072e-06, + "loss": 0.599, + "step": 6639 + }, + { + "epoch": 2.4511444745262123, + "grad_norm": 0.864839494228363, + "learning_rate": 3.2350729206076515e-06, + "loss": 0.6186, + "step": 6640 + }, + { + "epoch": 2.4515136598572482, + "grad_norm": 0.8541459441184998, + "learning_rate": 3.2346080313129536e-06, + "loss": 0.5986, + "step": 6641 + }, + { + "epoch": 2.4518828451882846, + "grad_norm": 0.8753202557563782, + "learning_rate": 3.234143114213573e-06, + "loss": 0.621, + "step": 6642 + }, + { + "epoch": 2.452252030519321, + "grad_norm": 0.9262859225273132, + "learning_rate": 3.233678169327108e-06, + "loss": 0.6404, + "step": 6643 + }, + { + "epoch": 2.452621215850357, + "grad_norm": 0.8876028656959534, + "learning_rate": 3.2332131966711557e-06, + "loss": 0.6179, + "step": 6644 + }, + { + "epoch": 2.452990401181393, + "grad_norm": 0.9119341373443604, + "learning_rate": 3.2327481962633145e-06, + "loss": 0.5917, + "step": 6645 + }, + { + "epoch": 2.453359586512429, + "grad_norm": 0.9008866548538208, + "learning_rate": 3.2322831681211853e-06, + "loss": 0.6012, + "step": 6646 + }, + { + "epoch": 2.4537287718434655, + "grad_norm": 0.877566397190094, + "learning_rate": 3.2318181122623703e-06, + "loss": 0.6223, + "step": 6647 + }, + { + "epoch": 2.4540979571745014, + "grad_norm": 0.886857271194458, + "learning_rate": 3.2313530287044705e-06, + "loss": 0.614, + "step": 6648 + }, + { + "epoch": 2.4544671425055378, + "grad_norm": 0.883436381816864, + "learning_rate": 3.230887917465089e-06, + "loss": 0.5976, + "step": 6649 + }, + { + "epoch": 2.454836327836574, + "grad_norm": 0.8766323328018188, + "learning_rate": 3.2304227785618305e-06, + "loss": 0.5819, + "step": 6650 + }, + { + "epoch": 2.45520551316761, + "grad_norm": 0.8966810703277588, + "learning_rate": 3.2299576120123006e-06, + "loss": 0.6046, + "step": 6651 + }, + { + "epoch": 2.4555746984986464, + "grad_norm": 0.9047337770462036, + "learning_rate": 3.229492417834105e-06, + "loss": 0.6322, + "step": 6652 + }, + { + "epoch": 2.4559438838296823, + "grad_norm": 0.9142707586288452, + "learning_rate": 3.229027196044853e-06, + "loss": 0.5915, + "step": 6653 + }, + { + "epoch": 2.4563130691607187, + "grad_norm": 0.8842984437942505, + "learning_rate": 3.22856194666215e-06, + "loss": 0.5782, + "step": 6654 + }, + { + "epoch": 2.456682254491755, + "grad_norm": 0.909199595451355, + "learning_rate": 3.2280966697036076e-06, + "loss": 0.6303, + "step": 6655 + }, + { + "epoch": 2.457051439822791, + "grad_norm": 0.9126515984535217, + "learning_rate": 3.2276313651868364e-06, + "loss": 0.6391, + "step": 6656 + }, + { + "epoch": 2.4574206251538273, + "grad_norm": 0.918389618396759, + "learning_rate": 3.227166033129448e-06, + "loss": 0.5976, + "step": 6657 + }, + { + "epoch": 2.4577898104848632, + "grad_norm": 0.8876264095306396, + "learning_rate": 3.2267006735490546e-06, + "loss": 0.6167, + "step": 6658 + }, + { + "epoch": 2.4581589958158996, + "grad_norm": 1.14640212059021, + "learning_rate": 3.2262352864632694e-06, + "loss": 0.621, + "step": 6659 + }, + { + "epoch": 2.458528181146936, + "grad_norm": 0.9019989967346191, + "learning_rate": 3.225769871889708e-06, + "loss": 0.6316, + "step": 6660 + }, + { + "epoch": 2.458897366477972, + "grad_norm": 0.9294628500938416, + "learning_rate": 3.225304429845987e-06, + "loss": 0.6342, + "step": 6661 + }, + { + "epoch": 2.459266551809008, + "grad_norm": 0.8694780468940735, + "learning_rate": 3.224838960349721e-06, + "loss": 0.6291, + "step": 6662 + }, + { + "epoch": 2.459635737140044, + "grad_norm": 0.9159122109413147, + "learning_rate": 3.2243734634185304e-06, + "loss": 0.6046, + "step": 6663 + }, + { + "epoch": 2.4600049224710805, + "grad_norm": 0.8719239234924316, + "learning_rate": 3.2239079390700313e-06, + "loss": 0.6472, + "step": 6664 + }, + { + "epoch": 2.460374107802117, + "grad_norm": 0.8929508924484253, + "learning_rate": 3.2234423873218463e-06, + "loss": 0.6194, + "step": 6665 + }, + { + "epoch": 2.4607432931331528, + "grad_norm": 0.9153685569763184, + "learning_rate": 3.2229768081915947e-06, + "loss": 0.6225, + "step": 6666 + }, + { + "epoch": 2.461112478464189, + "grad_norm": 0.8679944276809692, + "learning_rate": 3.2225112016968997e-06, + "loss": 0.6509, + "step": 6667 + }, + { + "epoch": 2.461481663795225, + "grad_norm": 0.8732336759567261, + "learning_rate": 3.2220455678553826e-06, + "loss": 0.6012, + "step": 6668 + }, + { + "epoch": 2.4618508491262614, + "grad_norm": 0.8894098401069641, + "learning_rate": 3.22157990668467e-06, + "loss": 0.6124, + "step": 6669 + }, + { + "epoch": 2.4622200344572978, + "grad_norm": 0.9187450408935547, + "learning_rate": 3.2211142182023846e-06, + "loss": 0.6277, + "step": 6670 + }, + { + "epoch": 2.4625892197883337, + "grad_norm": 0.9011822938919067, + "learning_rate": 3.2206485024261542e-06, + "loss": 0.6342, + "step": 6671 + }, + { + "epoch": 2.46295840511937, + "grad_norm": 0.9028558135032654, + "learning_rate": 3.220182759373605e-06, + "loss": 0.6335, + "step": 6672 + }, + { + "epoch": 2.463327590450406, + "grad_norm": 0.9237378239631653, + "learning_rate": 3.2197169890623657e-06, + "loss": 0.6141, + "step": 6673 + }, + { + "epoch": 2.4636967757814423, + "grad_norm": 0.917557954788208, + "learning_rate": 3.219251191510066e-06, + "loss": 0.6202, + "step": 6674 + }, + { + "epoch": 2.464065961112478, + "grad_norm": 0.9420355558395386, + "learning_rate": 3.2187853667343357e-06, + "loss": 0.6154, + "step": 6675 + }, + { + "epoch": 2.4644351464435146, + "grad_norm": 0.8752374053001404, + "learning_rate": 3.2183195147528056e-06, + "loss": 0.5876, + "step": 6676 + }, + { + "epoch": 2.464804331774551, + "grad_norm": 0.8776642680168152, + "learning_rate": 3.217853635583109e-06, + "loss": 0.5971, + "step": 6677 + }, + { + "epoch": 2.465173517105587, + "grad_norm": 0.9082798957824707, + "learning_rate": 3.217387729242878e-06, + "loss": 0.6429, + "step": 6678 + }, + { + "epoch": 2.465542702436623, + "grad_norm": 0.8911619782447815, + "learning_rate": 3.216921795749749e-06, + "loss": 0.6122, + "step": 6679 + }, + { + "epoch": 2.4659118877676596, + "grad_norm": 0.90144282579422, + "learning_rate": 3.216455835121356e-06, + "loss": 0.6527, + "step": 6680 + }, + { + "epoch": 2.4662810730986955, + "grad_norm": 0.8904519081115723, + "learning_rate": 3.215989847375336e-06, + "loss": 0.6241, + "step": 6681 + }, + { + "epoch": 2.466650258429732, + "grad_norm": 0.8878113031387329, + "learning_rate": 3.2155238325293253e-06, + "loss": 0.6227, + "step": 6682 + }, + { + "epoch": 2.4670194437607678, + "grad_norm": 0.8909118175506592, + "learning_rate": 3.215057790600964e-06, + "loss": 0.5911, + "step": 6683 + }, + { + "epoch": 2.467388629091804, + "grad_norm": 0.9113565683364868, + "learning_rate": 3.21459172160789e-06, + "loss": 0.6308, + "step": 6684 + }, + { + "epoch": 2.46775781442284, + "grad_norm": 0.8915936350822449, + "learning_rate": 3.2141256255677467e-06, + "loss": 0.6156, + "step": 6685 + }, + { + "epoch": 2.4681269997538764, + "grad_norm": 0.8923668265342712, + "learning_rate": 3.213659502498173e-06, + "loss": 0.6232, + "step": 6686 + }, + { + "epoch": 2.4684961850849128, + "grad_norm": 0.9062240123748779, + "learning_rate": 3.213193352416812e-06, + "loss": 0.6364, + "step": 6687 + }, + { + "epoch": 2.4688653704159487, + "grad_norm": 0.8870229125022888, + "learning_rate": 3.212727175341308e-06, + "loss": 0.6434, + "step": 6688 + }, + { + "epoch": 2.469234555746985, + "grad_norm": 0.8837342858314514, + "learning_rate": 3.2122609712893055e-06, + "loss": 0.6469, + "step": 6689 + }, + { + "epoch": 2.4696037410780214, + "grad_norm": 0.8985797762870789, + "learning_rate": 3.21179474027845e-06, + "loss": 0.6159, + "step": 6690 + }, + { + "epoch": 2.4699729264090573, + "grad_norm": 0.8987293839454651, + "learning_rate": 3.211328482326388e-06, + "loss": 0.6475, + "step": 6691 + }, + { + "epoch": 2.4703421117400937, + "grad_norm": 0.9215559363365173, + "learning_rate": 3.210862197450767e-06, + "loss": 0.6202, + "step": 6692 + }, + { + "epoch": 2.4707112970711296, + "grad_norm": 0.8849296569824219, + "learning_rate": 3.2103958856692373e-06, + "loss": 0.6205, + "step": 6693 + }, + { + "epoch": 2.471080482402166, + "grad_norm": 0.9684334397315979, + "learning_rate": 3.209929546999446e-06, + "loss": 0.6449, + "step": 6694 + }, + { + "epoch": 2.471449667733202, + "grad_norm": 0.8828813433647156, + "learning_rate": 3.209463181459047e-06, + "loss": 0.5994, + "step": 6695 + }, + { + "epoch": 2.471818853064238, + "grad_norm": 0.865740954875946, + "learning_rate": 3.20899678906569e-06, + "loss": 0.6123, + "step": 6696 + }, + { + "epoch": 2.4721880383952746, + "grad_norm": 0.9229961633682251, + "learning_rate": 3.208530369837028e-06, + "loss": 0.6519, + "step": 6697 + }, + { + "epoch": 2.4725572237263105, + "grad_norm": 0.879624605178833, + "learning_rate": 3.2080639237907154e-06, + "loss": 0.5814, + "step": 6698 + }, + { + "epoch": 2.472926409057347, + "grad_norm": 0.9112635254859924, + "learning_rate": 3.2075974509444064e-06, + "loss": 0.6536, + "step": 6699 + }, + { + "epoch": 2.473295594388383, + "grad_norm": 0.8843146562576294, + "learning_rate": 3.2071309513157577e-06, + "loss": 0.6471, + "step": 6700 + }, + { + "epoch": 2.473664779719419, + "grad_norm": 0.8899278044700623, + "learning_rate": 3.2066644249224255e-06, + "loss": 0.6249, + "step": 6701 + }, + { + "epoch": 2.4740339650504555, + "grad_norm": 0.8777864575386047, + "learning_rate": 3.2061978717820667e-06, + "loss": 0.6516, + "step": 6702 + }, + { + "epoch": 2.4744031503814914, + "grad_norm": 0.9046998023986816, + "learning_rate": 3.2057312919123428e-06, + "loss": 0.6135, + "step": 6703 + }, + { + "epoch": 2.4747723357125277, + "grad_norm": 0.9134575128555298, + "learning_rate": 3.2052646853309122e-06, + "loss": 0.6292, + "step": 6704 + }, + { + "epoch": 2.4751415210435637, + "grad_norm": 0.9009960889816284, + "learning_rate": 3.2047980520554357e-06, + "loss": 0.603, + "step": 6705 + }, + { + "epoch": 2.4755107063746, + "grad_norm": 0.8751606345176697, + "learning_rate": 3.2043313921035747e-06, + "loss": 0.6238, + "step": 6706 + }, + { + "epoch": 2.4758798917056364, + "grad_norm": 0.8739036917686462, + "learning_rate": 3.2038647054929928e-06, + "loss": 0.5928, + "step": 6707 + }, + { + "epoch": 2.4762490770366723, + "grad_norm": 0.8652421832084656, + "learning_rate": 3.2033979922413544e-06, + "loss": 0.5917, + "step": 6708 + }, + { + "epoch": 2.4766182623677087, + "grad_norm": 0.8905410766601562, + "learning_rate": 3.202931252366324e-06, + "loss": 0.6264, + "step": 6709 + }, + { + "epoch": 2.476987447698745, + "grad_norm": 0.8933557271957397, + "learning_rate": 3.202464485885568e-06, + "loss": 0.6645, + "step": 6710 + }, + { + "epoch": 2.477356633029781, + "grad_norm": 0.8843584656715393, + "learning_rate": 3.2019976928167516e-06, + "loss": 0.6385, + "step": 6711 + }, + { + "epoch": 2.4777258183608173, + "grad_norm": 0.928109347820282, + "learning_rate": 3.201530873177545e-06, + "loss": 0.6428, + "step": 6712 + }, + { + "epoch": 2.478095003691853, + "grad_norm": 0.8788700103759766, + "learning_rate": 3.2010640269856163e-06, + "loss": 0.6229, + "step": 6713 + }, + { + "epoch": 2.4784641890228896, + "grad_norm": 0.8660899996757507, + "learning_rate": 3.2005971542586355e-06, + "loss": 0.5981, + "step": 6714 + }, + { + "epoch": 2.4788333743539255, + "grad_norm": 0.8899818658828735, + "learning_rate": 3.200130255014273e-06, + "loss": 0.6432, + "step": 6715 + }, + { + "epoch": 2.479202559684962, + "grad_norm": 0.9046580791473389, + "learning_rate": 3.1996633292702013e-06, + "loss": 0.6081, + "step": 6716 + }, + { + "epoch": 2.479571745015998, + "grad_norm": 0.8901419043540955, + "learning_rate": 3.199196377044094e-06, + "loss": 0.5993, + "step": 6717 + }, + { + "epoch": 2.479940930347034, + "grad_norm": 0.8801125288009644, + "learning_rate": 3.1987293983536245e-06, + "loss": 0.6205, + "step": 6718 + }, + { + "epoch": 2.4803101156780705, + "grad_norm": 0.8779366612434387, + "learning_rate": 3.1982623932164675e-06, + "loss": 0.6152, + "step": 6719 + }, + { + "epoch": 2.4806793010091064, + "grad_norm": 0.8864232897758484, + "learning_rate": 3.1977953616502995e-06, + "loss": 0.6065, + "step": 6720 + }, + { + "epoch": 2.4810484863401427, + "grad_norm": 0.9088690876960754, + "learning_rate": 3.1973283036727963e-06, + "loss": 0.6038, + "step": 6721 + }, + { + "epoch": 2.481417671671179, + "grad_norm": 0.8922807574272156, + "learning_rate": 3.196861219301638e-06, + "loss": 0.6285, + "step": 6722 + }, + { + "epoch": 2.481786857002215, + "grad_norm": 0.931229293346405, + "learning_rate": 3.1963941085545023e-06, + "loss": 0.6195, + "step": 6723 + }, + { + "epoch": 2.4821560423332514, + "grad_norm": 0.8770807385444641, + "learning_rate": 3.195926971449069e-06, + "loss": 0.606, + "step": 6724 + }, + { + "epoch": 2.4825252276642873, + "grad_norm": 0.8915525078773499, + "learning_rate": 3.1954598080030197e-06, + "loss": 0.6132, + "step": 6725 + }, + { + "epoch": 2.4828944129953237, + "grad_norm": 0.88948655128479, + "learning_rate": 3.1949926182340356e-06, + "loss": 0.6227, + "step": 6726 + }, + { + "epoch": 2.48326359832636, + "grad_norm": 0.9030357003211975, + "learning_rate": 3.1945254021598018e-06, + "loss": 0.6306, + "step": 6727 + }, + { + "epoch": 2.483632783657396, + "grad_norm": 0.8930654525756836, + "learning_rate": 3.194058159797999e-06, + "loss": 0.6084, + "step": 6728 + }, + { + "epoch": 2.4840019689884323, + "grad_norm": 0.898191511631012, + "learning_rate": 3.193590891166315e-06, + "loss": 0.5931, + "step": 6729 + }, + { + "epoch": 2.484371154319468, + "grad_norm": 0.8645055294036865, + "learning_rate": 3.193123596282434e-06, + "loss": 0.5917, + "step": 6730 + }, + { + "epoch": 2.4847403396505046, + "grad_norm": 0.8932679295539856, + "learning_rate": 3.192656275164044e-06, + "loss": 0.6036, + "step": 6731 + }, + { + "epoch": 2.485109524981541, + "grad_norm": 0.9037833213806152, + "learning_rate": 3.192188927828832e-06, + "loss": 0.6257, + "step": 6732 + }, + { + "epoch": 2.485478710312577, + "grad_norm": 0.8611756563186646, + "learning_rate": 3.191721554294489e-06, + "loss": 0.6313, + "step": 6733 + }, + { + "epoch": 2.485847895643613, + "grad_norm": 0.8974111080169678, + "learning_rate": 3.1912541545787024e-06, + "loss": 0.6275, + "step": 6734 + }, + { + "epoch": 2.486217080974649, + "grad_norm": 0.8683516383171082, + "learning_rate": 3.190786728699164e-06, + "loss": 0.5746, + "step": 6735 + }, + { + "epoch": 2.4865862663056855, + "grad_norm": 0.89356929063797, + "learning_rate": 3.190319276673566e-06, + "loss": 0.5975, + "step": 6736 + }, + { + "epoch": 2.486955451636722, + "grad_norm": 0.8910592794418335, + "learning_rate": 3.189851798519602e-06, + "loss": 0.6496, + "step": 6737 + }, + { + "epoch": 2.4873246369677577, + "grad_norm": 0.8907511830329895, + "learning_rate": 3.189384294254965e-06, + "loss": 0.6374, + "step": 6738 + }, + { + "epoch": 2.487693822298794, + "grad_norm": 0.8954175114631653, + "learning_rate": 3.188916763897349e-06, + "loss": 0.6215, + "step": 6739 + }, + { + "epoch": 2.48806300762983, + "grad_norm": 0.9006734490394592, + "learning_rate": 3.188449207464451e-06, + "loss": 0.6343, + "step": 6740 + }, + { + "epoch": 2.4884321929608664, + "grad_norm": 0.9147834181785583, + "learning_rate": 3.1879816249739694e-06, + "loss": 0.568, + "step": 6741 + }, + { + "epoch": 2.4888013782919023, + "grad_norm": 0.8663974404335022, + "learning_rate": 3.187514016443599e-06, + "loss": 0.5806, + "step": 6742 + }, + { + "epoch": 2.4891705636229386, + "grad_norm": 0.9094536304473877, + "learning_rate": 3.1870463818910407e-06, + "loss": 0.5931, + "step": 6743 + }, + { + "epoch": 2.489539748953975, + "grad_norm": 0.8948284983634949, + "learning_rate": 3.1865787213339926e-06, + "loss": 0.5848, + "step": 6744 + }, + { + "epoch": 2.489908934285011, + "grad_norm": 0.8585731387138367, + "learning_rate": 3.186111034790158e-06, + "loss": 0.6278, + "step": 6745 + }, + { + "epoch": 2.4902781196160473, + "grad_norm": 0.9005891680717468, + "learning_rate": 3.1856433222772363e-06, + "loss": 0.6121, + "step": 6746 + }, + { + "epoch": 2.4906473049470836, + "grad_norm": 0.8866665363311768, + "learning_rate": 3.1851755838129323e-06, + "loss": 0.5914, + "step": 6747 + }, + { + "epoch": 2.4910164902781196, + "grad_norm": 0.872961163520813, + "learning_rate": 3.184707819414948e-06, + "loss": 0.6147, + "step": 6748 + }, + { + "epoch": 2.491385675609156, + "grad_norm": 0.9151605367660522, + "learning_rate": 3.184240029100989e-06, + "loss": 0.6356, + "step": 6749 + }, + { + "epoch": 2.491754860940192, + "grad_norm": 0.8744645118713379, + "learning_rate": 3.1837722128887607e-06, + "loss": 0.6048, + "step": 6750 + }, + { + "epoch": 2.492124046271228, + "grad_norm": 0.8879474401473999, + "learning_rate": 3.18330437079597e-06, + "loss": 0.6069, + "step": 6751 + }, + { + "epoch": 2.492493231602264, + "grad_norm": 0.9066160917282104, + "learning_rate": 3.182836502840324e-06, + "loss": 0.6306, + "step": 6752 + }, + { + "epoch": 2.4928624169333005, + "grad_norm": 0.8924762010574341, + "learning_rate": 3.1823686090395333e-06, + "loss": 0.6201, + "step": 6753 + }, + { + "epoch": 2.493231602264337, + "grad_norm": 0.9108995795249939, + "learning_rate": 3.181900689411304e-06, + "loss": 0.6077, + "step": 6754 + }, + { + "epoch": 2.4936007875953727, + "grad_norm": 0.9085102081298828, + "learning_rate": 3.1814327439733506e-06, + "loss": 0.6074, + "step": 6755 + }, + { + "epoch": 2.493969972926409, + "grad_norm": 0.9040588736534119, + "learning_rate": 3.180964772743382e-06, + "loss": 0.6184, + "step": 6756 + }, + { + "epoch": 2.4943391582574455, + "grad_norm": 0.8805028200149536, + "learning_rate": 3.1804967757391126e-06, + "loss": 0.6153, + "step": 6757 + }, + { + "epoch": 2.4947083435884814, + "grad_norm": 0.9068371057510376, + "learning_rate": 3.1800287529782538e-06, + "loss": 0.5932, + "step": 6758 + }, + { + "epoch": 2.4950775289195177, + "grad_norm": 0.8555784225463867, + "learning_rate": 3.179560704478522e-06, + "loss": 0.583, + "step": 6759 + }, + { + "epoch": 2.4954467142505536, + "grad_norm": 0.8982033133506775, + "learning_rate": 3.1790926302576318e-06, + "loss": 0.6448, + "step": 6760 + }, + { + "epoch": 2.49581589958159, + "grad_norm": 1.076346755027771, + "learning_rate": 3.1786245303333e-06, + "loss": 0.6373, + "step": 6761 + }, + { + "epoch": 2.496185084912626, + "grad_norm": 0.9038366675376892, + "learning_rate": 3.178156404723244e-06, + "loss": 0.6564, + "step": 6762 + }, + { + "epoch": 2.4965542702436623, + "grad_norm": 0.8670744299888611, + "learning_rate": 3.177688253445182e-06, + "loss": 0.5907, + "step": 6763 + }, + { + "epoch": 2.4969234555746986, + "grad_norm": 0.8562642335891724, + "learning_rate": 3.177220076516833e-06, + "loss": 0.6377, + "step": 6764 + }, + { + "epoch": 2.4972926409057346, + "grad_norm": 0.9102648496627808, + "learning_rate": 3.176751873955919e-06, + "loss": 0.6403, + "step": 6765 + }, + { + "epoch": 2.497661826236771, + "grad_norm": 0.8859831690788269, + "learning_rate": 3.1762836457801595e-06, + "loss": 0.6335, + "step": 6766 + }, + { + "epoch": 2.4980310115678073, + "grad_norm": 0.9041801691055298, + "learning_rate": 3.175815392007278e-06, + "loss": 0.6001, + "step": 6767 + }, + { + "epoch": 2.498400196898843, + "grad_norm": 0.9056479334831238, + "learning_rate": 3.175347112654995e-06, + "loss": 0.5967, + "step": 6768 + }, + { + "epoch": 2.4987693822298795, + "grad_norm": 0.8853076100349426, + "learning_rate": 3.1748788077410392e-06, + "loss": 0.5905, + "step": 6769 + }, + { + "epoch": 2.4991385675609155, + "grad_norm": 0.8785825967788696, + "learning_rate": 3.174410477283133e-06, + "loss": 0.6119, + "step": 6770 + }, + { + "epoch": 2.499507752891952, + "grad_norm": 0.8574090003967285, + "learning_rate": 3.173942121299003e-06, + "loss": 0.6025, + "step": 6771 + }, + { + "epoch": 2.4998769382229877, + "grad_norm": 0.8916663527488708, + "learning_rate": 3.1734737398063765e-06, + "loss": 0.6161, + "step": 6772 + }, + { + "epoch": 2.500246123554024, + "grad_norm": 0.9171807169914246, + "learning_rate": 3.173005332822981e-06, + "loss": 0.588, + "step": 6773 + }, + { + "epoch": 2.5006153088850605, + "grad_norm": 0.8795968890190125, + "learning_rate": 3.1725369003665464e-06, + "loss": 0.5939, + "step": 6774 + }, + { + "epoch": 2.5009844942160964, + "grad_norm": 0.86720210313797, + "learning_rate": 3.1720684424548027e-06, + "loss": 0.6132, + "step": 6775 + }, + { + "epoch": 2.5013536795471327, + "grad_norm": 0.9178309440612793, + "learning_rate": 3.1715999591054804e-06, + "loss": 0.6298, + "step": 6776 + }, + { + "epoch": 2.501722864878169, + "grad_norm": 0.8526558876037598, + "learning_rate": 3.171131450336311e-06, + "loss": 0.5719, + "step": 6777 + }, + { + "epoch": 2.502092050209205, + "grad_norm": 0.8774260878562927, + "learning_rate": 3.170662916165028e-06, + "loss": 0.6141, + "step": 6778 + }, + { + "epoch": 2.5024612355402414, + "grad_norm": 0.8887192606925964, + "learning_rate": 3.170194356609366e-06, + "loss": 0.587, + "step": 6779 + }, + { + "epoch": 2.5028304208712773, + "grad_norm": 0.9283472895622253, + "learning_rate": 3.169725771687059e-06, + "loss": 0.6025, + "step": 6780 + }, + { + "epoch": 2.5031996062023136, + "grad_norm": 0.882394552230835, + "learning_rate": 3.169257161415843e-06, + "loss": 0.6138, + "step": 6781 + }, + { + "epoch": 2.5035687915333495, + "grad_norm": 0.8981188535690308, + "learning_rate": 3.1687885258134543e-06, + "loss": 0.6148, + "step": 6782 + }, + { + "epoch": 2.503937976864386, + "grad_norm": 0.8716127276420593, + "learning_rate": 3.168319864897631e-06, + "loss": 0.6531, + "step": 6783 + }, + { + "epoch": 2.5043071621954223, + "grad_norm": 0.8884944915771484, + "learning_rate": 3.1678511786861123e-06, + "loss": 0.5714, + "step": 6784 + }, + { + "epoch": 2.504676347526458, + "grad_norm": 0.9061616659164429, + "learning_rate": 3.167382467196637e-06, + "loss": 0.6487, + "step": 6785 + }, + { + "epoch": 2.5050455328574945, + "grad_norm": 0.8771594762802124, + "learning_rate": 3.1669137304469456e-06, + "loss": 0.576, + "step": 6786 + }, + { + "epoch": 2.505414718188531, + "grad_norm": 0.8738110065460205, + "learning_rate": 3.16644496845478e-06, + "loss": 0.5973, + "step": 6787 + }, + { + "epoch": 2.505783903519567, + "grad_norm": 0.8941686153411865, + "learning_rate": 3.1659761812378826e-06, + "loss": 0.6486, + "step": 6788 + }, + { + "epoch": 2.5061530888506027, + "grad_norm": 0.8962488174438477, + "learning_rate": 3.1655073688139975e-06, + "loss": 0.5827, + "step": 6789 + }, + { + "epoch": 2.506522274181639, + "grad_norm": 0.8682740330696106, + "learning_rate": 3.165038531200868e-06, + "loss": 0.6266, + "step": 6790 + }, + { + "epoch": 2.5068914595126754, + "grad_norm": 0.9381815195083618, + "learning_rate": 3.1645696684162407e-06, + "loss": 0.6279, + "step": 6791 + }, + { + "epoch": 2.5072606448437114, + "grad_norm": 0.9155692458152771, + "learning_rate": 3.16410078047786e-06, + "loss": 0.6428, + "step": 6792 + }, + { + "epoch": 2.5076298301747477, + "grad_norm": 0.8877423405647278, + "learning_rate": 3.163631867403475e-06, + "loss": 0.6078, + "step": 6793 + }, + { + "epoch": 2.507999015505784, + "grad_norm": 0.8829679489135742, + "learning_rate": 3.1631629292108334e-06, + "loss": 0.6224, + "step": 6794 + }, + { + "epoch": 2.50836820083682, + "grad_norm": 0.8742494583129883, + "learning_rate": 3.1626939659176842e-06, + "loss": 0.6152, + "step": 6795 + }, + { + "epoch": 2.5087373861678564, + "grad_norm": 0.8868880867958069, + "learning_rate": 3.1622249775417774e-06, + "loss": 0.6259, + "step": 6796 + }, + { + "epoch": 2.5091065714988927, + "grad_norm": 0.8796166181564331, + "learning_rate": 3.1617559641008646e-06, + "loss": 0.6023, + "step": 6797 + }, + { + "epoch": 2.5094757568299286, + "grad_norm": 0.8896876573562622, + "learning_rate": 3.1612869256126966e-06, + "loss": 0.6333, + "step": 6798 + }, + { + "epoch": 2.5098449421609645, + "grad_norm": 0.8852813243865967, + "learning_rate": 3.160817862095028e-06, + "loss": 0.64, + "step": 6799 + }, + { + "epoch": 2.510214127492001, + "grad_norm": 0.8848188519477844, + "learning_rate": 3.160348773565611e-06, + "loss": 0.5915, + "step": 6800 + }, + { + "epoch": 2.5105833128230373, + "grad_norm": 0.8882603049278259, + "learning_rate": 3.1598796600422017e-06, + "loss": 0.5845, + "step": 6801 + }, + { + "epoch": 2.510952498154073, + "grad_norm": 0.86459881067276, + "learning_rate": 3.1594105215425557e-06, + "loss": 0.6173, + "step": 6802 + }, + { + "epoch": 2.5113216834851095, + "grad_norm": 0.9125607013702393, + "learning_rate": 3.1589413580844295e-06, + "loss": 0.6378, + "step": 6803 + }, + { + "epoch": 2.511690868816146, + "grad_norm": 0.9124771356582642, + "learning_rate": 3.1584721696855813e-06, + "loss": 0.6114, + "step": 6804 + }, + { + "epoch": 2.512060054147182, + "grad_norm": 0.8801177740097046, + "learning_rate": 3.158002956363769e-06, + "loss": 0.6101, + "step": 6805 + }, + { + "epoch": 2.512429239478218, + "grad_norm": 0.9025785326957703, + "learning_rate": 3.157533718136752e-06, + "loss": 0.6237, + "step": 6806 + }, + { + "epoch": 2.512798424809254, + "grad_norm": 0.9247942566871643, + "learning_rate": 3.157064455022292e-06, + "loss": 0.6071, + "step": 6807 + }, + { + "epoch": 2.5131676101402904, + "grad_norm": 0.8922885656356812, + "learning_rate": 3.156595167038149e-06, + "loss": 0.6806, + "step": 6808 + }, + { + "epoch": 2.5135367954713264, + "grad_norm": 0.8849051594734192, + "learning_rate": 3.1561258542020865e-06, + "loss": 0.6348, + "step": 6809 + }, + { + "epoch": 2.5139059808023627, + "grad_norm": 0.9036447405815125, + "learning_rate": 3.1556565165318676e-06, + "loss": 0.6733, + "step": 6810 + }, + { + "epoch": 2.514275166133399, + "grad_norm": 0.873706042766571, + "learning_rate": 3.155187154045256e-06, + "loss": 0.6407, + "step": 6811 + }, + { + "epoch": 2.514644351464435, + "grad_norm": 0.8981397747993469, + "learning_rate": 3.154717766760018e-06, + "loss": 0.6015, + "step": 6812 + }, + { + "epoch": 2.5150135367954713, + "grad_norm": 0.8727259635925293, + "learning_rate": 3.1542483546939195e-06, + "loss": 0.6182, + "step": 6813 + }, + { + "epoch": 2.5153827221265077, + "grad_norm": 0.8755634427070618, + "learning_rate": 3.1537789178647265e-06, + "loss": 0.5986, + "step": 6814 + }, + { + "epoch": 2.5157519074575436, + "grad_norm": 0.9286865592002869, + "learning_rate": 3.153309456290208e-06, + "loss": 0.6124, + "step": 6815 + }, + { + "epoch": 2.51612109278858, + "grad_norm": 0.9308950901031494, + "learning_rate": 3.1528399699881327e-06, + "loss": 0.6526, + "step": 6816 + }, + { + "epoch": 2.516490278119616, + "grad_norm": 0.8886969685554504, + "learning_rate": 3.152370458976271e-06, + "loss": 0.6048, + "step": 6817 + }, + { + "epoch": 2.5168594634506523, + "grad_norm": 0.877002477645874, + "learning_rate": 3.151900923272393e-06, + "loss": 0.6094, + "step": 6818 + }, + { + "epoch": 2.517228648781688, + "grad_norm": 1.1780617237091064, + "learning_rate": 3.151431362894271e-06, + "loss": 0.6029, + "step": 6819 + }, + { + "epoch": 2.5175978341127245, + "grad_norm": 0.8832935094833374, + "learning_rate": 3.1509617778596767e-06, + "loss": 0.6479, + "step": 6820 + }, + { + "epoch": 2.517967019443761, + "grad_norm": 0.9003704786300659, + "learning_rate": 3.1504921681863855e-06, + "loss": 0.6415, + "step": 6821 + }, + { + "epoch": 2.518336204774797, + "grad_norm": 0.8786479234695435, + "learning_rate": 3.1500225338921703e-06, + "loss": 0.6103, + "step": 6822 + }, + { + "epoch": 2.518705390105833, + "grad_norm": 0.9174721240997314, + "learning_rate": 3.1495528749948083e-06, + "loss": 0.5993, + "step": 6823 + }, + { + "epoch": 2.5190745754368695, + "grad_norm": 0.8791833519935608, + "learning_rate": 3.1490831915120744e-06, + "loss": 0.6228, + "step": 6824 + }, + { + "epoch": 2.5194437607679054, + "grad_norm": 0.9193525910377502, + "learning_rate": 3.1486134834617466e-06, + "loss": 0.654, + "step": 6825 + }, + { + "epoch": 2.519812946098942, + "grad_norm": 0.8942224383354187, + "learning_rate": 3.1481437508616033e-06, + "loss": 0.6619, + "step": 6826 + }, + { + "epoch": 2.5201821314299777, + "grad_norm": 0.8958398103713989, + "learning_rate": 3.1476739937294232e-06, + "loss": 0.6176, + "step": 6827 + }, + { + "epoch": 2.520551316761014, + "grad_norm": 0.9023460149765015, + "learning_rate": 3.147204212082987e-06, + "loss": 0.5965, + "step": 6828 + }, + { + "epoch": 2.52092050209205, + "grad_norm": 0.8862528204917908, + "learning_rate": 3.146734405940076e-06, + "loss": 0.5875, + "step": 6829 + }, + { + "epoch": 2.5212896874230863, + "grad_norm": 0.8919878005981445, + "learning_rate": 3.146264575318471e-06, + "loss": 0.621, + "step": 6830 + }, + { + "epoch": 2.5216588727541227, + "grad_norm": 0.8840134143829346, + "learning_rate": 3.145794720235956e-06, + "loss": 0.6034, + "step": 6831 + }, + { + "epoch": 2.5220280580851586, + "grad_norm": 0.901562511920929, + "learning_rate": 3.1453248407103156e-06, + "loss": 0.6309, + "step": 6832 + }, + { + "epoch": 2.522397243416195, + "grad_norm": 0.8849226236343384, + "learning_rate": 3.144854936759333e-06, + "loss": 0.6192, + "step": 6833 + }, + { + "epoch": 2.5227664287472313, + "grad_norm": 0.9093470573425293, + "learning_rate": 3.144385008400794e-06, + "loss": 0.5957, + "step": 6834 + }, + { + "epoch": 2.5231356140782673, + "grad_norm": 0.8857989311218262, + "learning_rate": 3.1439150556524866e-06, + "loss": 0.579, + "step": 6835 + }, + { + "epoch": 2.5235047994093036, + "grad_norm": 0.884914219379425, + "learning_rate": 3.143445078532198e-06, + "loss": 0.6172, + "step": 6836 + }, + { + "epoch": 2.5238739847403395, + "grad_norm": 0.9314041137695312, + "learning_rate": 3.1429750770577155e-06, + "loss": 0.6251, + "step": 6837 + }, + { + "epoch": 2.524243170071376, + "grad_norm": 0.8730660080909729, + "learning_rate": 3.142505051246829e-06, + "loss": 0.596, + "step": 6838 + }, + { + "epoch": 2.524612355402412, + "grad_norm": 0.8932158350944519, + "learning_rate": 3.1420350011173297e-06, + "loss": 0.6191, + "step": 6839 + }, + { + "epoch": 2.524981540733448, + "grad_norm": 0.8924177885055542, + "learning_rate": 3.141564926687008e-06, + "loss": 0.5909, + "step": 6840 + }, + { + "epoch": 2.5253507260644845, + "grad_norm": 0.8784066438674927, + "learning_rate": 3.1410948279736566e-06, + "loss": 0.6155, + "step": 6841 + }, + { + "epoch": 2.5257199113955204, + "grad_norm": 0.8724591732025146, + "learning_rate": 3.140624704995068e-06, + "loss": 0.5892, + "step": 6842 + }, + { + "epoch": 2.526089096726557, + "grad_norm": 0.891928493976593, + "learning_rate": 3.1401545577690363e-06, + "loss": 0.6385, + "step": 6843 + }, + { + "epoch": 2.526458282057593, + "grad_norm": 0.8823627829551697, + "learning_rate": 3.1396843863133565e-06, + "loss": 0.5827, + "step": 6844 + }, + { + "epoch": 2.526827467388629, + "grad_norm": 0.8558559417724609, + "learning_rate": 3.1392141906458253e-06, + "loss": 0.6153, + "step": 6845 + }, + { + "epoch": 2.5271966527196654, + "grad_norm": 0.8886880874633789, + "learning_rate": 3.1387439707842383e-06, + "loss": 0.6358, + "step": 6846 + }, + { + "epoch": 2.5275658380507013, + "grad_norm": 0.8525471091270447, + "learning_rate": 3.1382737267463934e-06, + "loss": 0.6165, + "step": 6847 + }, + { + "epoch": 2.5279350233817377, + "grad_norm": 0.9161831736564636, + "learning_rate": 3.1378034585500893e-06, + "loss": 0.6346, + "step": 6848 + }, + { + "epoch": 2.5283042087127736, + "grad_norm": 0.9034373760223389, + "learning_rate": 3.1373331662131256e-06, + "loss": 0.6629, + "step": 6849 + }, + { + "epoch": 2.52867339404381, + "grad_norm": 0.9108402729034424, + "learning_rate": 3.136862849753304e-06, + "loss": 0.6472, + "step": 6850 + }, + { + "epoch": 2.5290425793748463, + "grad_norm": 0.9000054001808167, + "learning_rate": 3.136392509188423e-06, + "loss": 0.6398, + "step": 6851 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.901006281375885, + "learning_rate": 3.135922144536287e-06, + "loss": 0.6595, + "step": 6852 + }, + { + "epoch": 2.5297809500369186, + "grad_norm": 0.918697714805603, + "learning_rate": 3.1354517558146978e-06, + "loss": 0.6166, + "step": 6853 + }, + { + "epoch": 2.530150135367955, + "grad_norm": 0.8815058469772339, + "learning_rate": 3.1349813430414617e-06, + "loss": 0.6004, + "step": 6854 + }, + { + "epoch": 2.530519320698991, + "grad_norm": 0.8732578754425049, + "learning_rate": 3.134510906234381e-06, + "loss": 0.6166, + "step": 6855 + }, + { + "epoch": 2.530888506030027, + "grad_norm": 0.9023682475090027, + "learning_rate": 3.1340404454112633e-06, + "loss": 0.6367, + "step": 6856 + }, + { + "epoch": 2.531257691361063, + "grad_norm": 0.8636946082115173, + "learning_rate": 3.1335699605899146e-06, + "loss": 0.6108, + "step": 6857 + }, + { + "epoch": 2.5316268766920995, + "grad_norm": 0.9007062911987305, + "learning_rate": 3.1330994517881425e-06, + "loss": 0.6118, + "step": 6858 + }, + { + "epoch": 2.5319960620231354, + "grad_norm": 0.8733677268028259, + "learning_rate": 3.132628919023756e-06, + "loss": 0.6128, + "step": 6859 + }, + { + "epoch": 2.532365247354172, + "grad_norm": 0.8874294757843018, + "learning_rate": 3.132158362314566e-06, + "loss": 0.6048, + "step": 6860 + }, + { + "epoch": 2.532734432685208, + "grad_norm": 0.9132538437843323, + "learning_rate": 3.13168778167838e-06, + "loss": 0.6035, + "step": 6861 + }, + { + "epoch": 2.533103618016244, + "grad_norm": 0.915969729423523, + "learning_rate": 3.1312171771330106e-06, + "loss": 0.6507, + "step": 6862 + }, + { + "epoch": 2.5334728033472804, + "grad_norm": 0.9235036373138428, + "learning_rate": 3.1307465486962706e-06, + "loss": 0.6259, + "step": 6863 + }, + { + "epoch": 2.533841988678317, + "grad_norm": 0.8898342847824097, + "learning_rate": 3.1302758963859736e-06, + "loss": 0.6447, + "step": 6864 + }, + { + "epoch": 2.5342111740093527, + "grad_norm": 0.9181999564170837, + "learning_rate": 3.1298052202199325e-06, + "loss": 0.6013, + "step": 6865 + }, + { + "epoch": 2.5345803593403886, + "grad_norm": 0.8914423584938049, + "learning_rate": 3.1293345202159626e-06, + "loss": 0.6008, + "step": 6866 + }, + { + "epoch": 2.534949544671425, + "grad_norm": 0.8950778245925903, + "learning_rate": 3.1288637963918796e-06, + "loss": 0.5984, + "step": 6867 + }, + { + "epoch": 2.5353187300024613, + "grad_norm": 0.8914331793785095, + "learning_rate": 3.1283930487655003e-06, + "loss": 0.6198, + "step": 6868 + }, + { + "epoch": 2.5356879153334972, + "grad_norm": 0.8817681074142456, + "learning_rate": 3.127922277354642e-06, + "loss": 0.6305, + "step": 6869 + }, + { + "epoch": 2.5360571006645336, + "grad_norm": 0.9226016998291016, + "learning_rate": 3.127451482177124e-06, + "loss": 0.615, + "step": 6870 + }, + { + "epoch": 2.53642628599557, + "grad_norm": 0.9092953205108643, + "learning_rate": 3.1269806632507655e-06, + "loss": 0.6135, + "step": 6871 + }, + { + "epoch": 2.536795471326606, + "grad_norm": 0.8952040076255798, + "learning_rate": 3.126509820593387e-06, + "loss": 0.6082, + "step": 6872 + }, + { + "epoch": 2.5371646566576422, + "grad_norm": 0.9027245044708252, + "learning_rate": 3.1260389542228085e-06, + "loss": 0.6296, + "step": 6873 + }, + { + "epoch": 2.537533841988678, + "grad_norm": 0.8626078963279724, + "learning_rate": 3.125568064156855e-06, + "loss": 0.6105, + "step": 6874 + }, + { + "epoch": 2.5379030273197145, + "grad_norm": 0.8885278105735779, + "learning_rate": 3.1250971504133466e-06, + "loss": 0.5857, + "step": 6875 + }, + { + "epoch": 2.5382722126507504, + "grad_norm": 0.8942751884460449, + "learning_rate": 3.1246262130101083e-06, + "loss": 0.6165, + "step": 6876 + }, + { + "epoch": 2.538641397981787, + "grad_norm": 0.9135850071907043, + "learning_rate": 3.124155251964965e-06, + "loss": 0.6541, + "step": 6877 + }, + { + "epoch": 2.539010583312823, + "grad_norm": 0.9007644057273865, + "learning_rate": 3.1236842672957425e-06, + "loss": 0.66, + "step": 6878 + }, + { + "epoch": 2.539379768643859, + "grad_norm": 0.8892017602920532, + "learning_rate": 3.123213259020267e-06, + "loss": 0.5946, + "step": 6879 + }, + { + "epoch": 2.5397489539748954, + "grad_norm": 0.8907625675201416, + "learning_rate": 3.1227422271563675e-06, + "loss": 0.6034, + "step": 6880 + }, + { + "epoch": 2.5401181393059318, + "grad_norm": 0.8871347904205322, + "learning_rate": 3.12227117172187e-06, + "loss": 0.6149, + "step": 6881 + }, + { + "epoch": 2.5404873246369677, + "grad_norm": 0.8568869829177856, + "learning_rate": 3.1218000927346052e-06, + "loss": 0.6028, + "step": 6882 + }, + { + "epoch": 2.540856509968004, + "grad_norm": 0.8834139704704285, + "learning_rate": 3.1213289902124033e-06, + "loss": 0.623, + "step": 6883 + }, + { + "epoch": 2.54122569529904, + "grad_norm": 0.8660569190979004, + "learning_rate": 3.120857864173096e-06, + "loss": 0.6037, + "step": 6884 + }, + { + "epoch": 2.5415948806300763, + "grad_norm": 0.8788653016090393, + "learning_rate": 3.1203867146345133e-06, + "loss": 0.6129, + "step": 6885 + }, + { + "epoch": 2.5419640659611122, + "grad_norm": 0.9132794141769409, + "learning_rate": 3.11991554161449e-06, + "loss": 0.6311, + "step": 6886 + }, + { + "epoch": 2.5423332512921486, + "grad_norm": 0.9184549450874329, + "learning_rate": 3.119444345130858e-06, + "loss": 0.6382, + "step": 6887 + }, + { + "epoch": 2.542702436623185, + "grad_norm": 0.8901375532150269, + "learning_rate": 3.1189731252014544e-06, + "loss": 0.5762, + "step": 6888 + }, + { + "epoch": 2.543071621954221, + "grad_norm": 0.8939695358276367, + "learning_rate": 3.1185018818441126e-06, + "loss": 0.6361, + "step": 6889 + }, + { + "epoch": 2.5434408072852572, + "grad_norm": 0.8785338997840881, + "learning_rate": 3.1180306150766703e-06, + "loss": 0.6285, + "step": 6890 + }, + { + "epoch": 2.5438099926162936, + "grad_norm": 0.8970841765403748, + "learning_rate": 3.1175593249169634e-06, + "loss": 0.6466, + "step": 6891 + }, + { + "epoch": 2.5441791779473295, + "grad_norm": 0.8865100145339966, + "learning_rate": 3.117088011382832e-06, + "loss": 0.617, + "step": 6892 + }, + { + "epoch": 2.544548363278366, + "grad_norm": 0.8713051676750183, + "learning_rate": 3.1166166744921135e-06, + "loss": 0.5952, + "step": 6893 + }, + { + "epoch": 2.544917548609402, + "grad_norm": 0.9841116070747375, + "learning_rate": 3.116145314262649e-06, + "loss": 0.637, + "step": 6894 + }, + { + "epoch": 2.545286733940438, + "grad_norm": 0.8820594549179077, + "learning_rate": 3.115673930712278e-06, + "loss": 0.5813, + "step": 6895 + }, + { + "epoch": 2.545655919271474, + "grad_norm": 0.8683436512947083, + "learning_rate": 3.1152025238588433e-06, + "loss": 0.6048, + "step": 6896 + }, + { + "epoch": 2.5460251046025104, + "grad_norm": 0.8928249478340149, + "learning_rate": 3.1147310937201873e-06, + "loss": 0.5944, + "step": 6897 + }, + { + "epoch": 2.5463942899335468, + "grad_norm": 0.9013893008232117, + "learning_rate": 3.114259640314154e-06, + "loss": 0.6194, + "step": 6898 + }, + { + "epoch": 2.5467634752645827, + "grad_norm": 0.8522540330886841, + "learning_rate": 3.1137881636585864e-06, + "loss": 0.5936, + "step": 6899 + }, + { + "epoch": 2.547132660595619, + "grad_norm": 0.8695140480995178, + "learning_rate": 3.1133166637713314e-06, + "loss": 0.5999, + "step": 6900 + }, + { + "epoch": 2.5475018459266554, + "grad_norm": 0.8800789713859558, + "learning_rate": 3.112845140670233e-06, + "loss": 0.5985, + "step": 6901 + }, + { + "epoch": 2.5478710312576913, + "grad_norm": 0.8809757828712463, + "learning_rate": 3.1123735943731402e-06, + "loss": 0.607, + "step": 6902 + }, + { + "epoch": 2.5482402165887277, + "grad_norm": 0.888076663017273, + "learning_rate": 3.1119020248978998e-06, + "loss": 0.6163, + "step": 6903 + }, + { + "epoch": 2.5486094019197636, + "grad_norm": 0.9203510284423828, + "learning_rate": 3.111430432262362e-06, + "loss": 0.634, + "step": 6904 + }, + { + "epoch": 2.5489785872508, + "grad_norm": 0.8939334750175476, + "learning_rate": 3.110958816484374e-06, + "loss": 0.587, + "step": 6905 + }, + { + "epoch": 2.549347772581836, + "grad_norm": 0.9072867035865784, + "learning_rate": 3.110487177581788e-06, + "loss": 0.6043, + "step": 6906 + }, + { + "epoch": 2.5497169579128722, + "grad_norm": 0.9079605340957642, + "learning_rate": 3.110015515572455e-06, + "loss": 0.608, + "step": 6907 + }, + { + "epoch": 2.5500861432439086, + "grad_norm": 0.8820279240608215, + "learning_rate": 3.1095438304742275e-06, + "loss": 0.6019, + "step": 6908 + }, + { + "epoch": 2.5504553285749445, + "grad_norm": 0.8996363878250122, + "learning_rate": 3.109072122304958e-06, + "loss": 0.6161, + "step": 6909 + }, + { + "epoch": 2.550824513905981, + "grad_norm": 0.9019712209701538, + "learning_rate": 3.1086003910825006e-06, + "loss": 0.5928, + "step": 6910 + }, + { + "epoch": 2.551193699237017, + "grad_norm": 0.9149045348167419, + "learning_rate": 3.108128636824711e-06, + "loss": 0.628, + "step": 6911 + }, + { + "epoch": 2.551562884568053, + "grad_norm": 0.8650303483009338, + "learning_rate": 3.107656859549445e-06, + "loss": 0.6038, + "step": 6912 + }, + { + "epoch": 2.5519320698990895, + "grad_norm": 0.8897508382797241, + "learning_rate": 3.107185059274558e-06, + "loss": 0.6095, + "step": 6913 + }, + { + "epoch": 2.5523012552301254, + "grad_norm": 0.958580732345581, + "learning_rate": 3.106713236017908e-06, + "loss": 0.6546, + "step": 6914 + }, + { + "epoch": 2.5526704405611618, + "grad_norm": 0.8897517323493958, + "learning_rate": 3.1062413897973533e-06, + "loss": 0.5944, + "step": 6915 + }, + { + "epoch": 2.5530396258921977, + "grad_norm": 0.92780601978302, + "learning_rate": 3.1057695206307536e-06, + "loss": 0.6596, + "step": 6916 + }, + { + "epoch": 2.553408811223234, + "grad_norm": 0.8785426020622253, + "learning_rate": 3.1052976285359686e-06, + "loss": 0.614, + "step": 6917 + }, + { + "epoch": 2.5537779965542704, + "grad_norm": 0.8947378993034363, + "learning_rate": 3.10482571353086e-06, + "loss": 0.618, + "step": 6918 + }, + { + "epoch": 2.5541471818853063, + "grad_norm": 0.893308162689209, + "learning_rate": 3.104353775633289e-06, + "loss": 0.6225, + "step": 6919 + }, + { + "epoch": 2.5545163672163427, + "grad_norm": 0.8874735236167908, + "learning_rate": 3.1038818148611178e-06, + "loss": 0.6152, + "step": 6920 + }, + { + "epoch": 2.554885552547379, + "grad_norm": 0.8804842829704285, + "learning_rate": 3.1034098312322103e-06, + "loss": 0.6494, + "step": 6921 + }, + { + "epoch": 2.555254737878415, + "grad_norm": 0.8992946743965149, + "learning_rate": 3.102937824764432e-06, + "loss": 0.6383, + "step": 6922 + }, + { + "epoch": 2.5556239232094513, + "grad_norm": 0.868079423904419, + "learning_rate": 3.1024657954756463e-06, + "loss": 0.584, + "step": 6923 + }, + { + "epoch": 2.5559931085404872, + "grad_norm": 0.8997260332107544, + "learning_rate": 3.1019937433837216e-06, + "loss": 0.5901, + "step": 6924 + }, + { + "epoch": 2.5563622938715236, + "grad_norm": 0.8861425518989563, + "learning_rate": 3.101521668506522e-06, + "loss": 0.6016, + "step": 6925 + }, + { + "epoch": 2.5567314792025595, + "grad_norm": 0.9066355228424072, + "learning_rate": 3.1010495708619188e-06, + "loss": 0.6044, + "step": 6926 + }, + { + "epoch": 2.557100664533596, + "grad_norm": 0.8901737332344055, + "learning_rate": 3.1005774504677783e-06, + "loss": 0.6116, + "step": 6927 + }, + { + "epoch": 2.557469849864632, + "grad_norm": 0.8640279173851013, + "learning_rate": 3.1001053073419706e-06, + "loss": 0.6253, + "step": 6928 + }, + { + "epoch": 2.557839035195668, + "grad_norm": 0.8915820121765137, + "learning_rate": 3.099633141502366e-06, + "loss": 0.591, + "step": 6929 + }, + { + "epoch": 2.5582082205267045, + "grad_norm": 0.8842781186103821, + "learning_rate": 3.0991609529668374e-06, + "loss": 0.6409, + "step": 6930 + }, + { + "epoch": 2.558577405857741, + "grad_norm": 0.9514613151550293, + "learning_rate": 3.098688741753255e-06, + "loss": 0.6237, + "step": 6931 + }, + { + "epoch": 2.5589465911887768, + "grad_norm": 0.8793916702270508, + "learning_rate": 3.098216507879494e-06, + "loss": 0.5865, + "step": 6932 + }, + { + "epoch": 2.5593157765198127, + "grad_norm": 0.8980348706245422, + "learning_rate": 3.097744251363426e-06, + "loss": 0.6414, + "step": 6933 + }, + { + "epoch": 2.559684961850849, + "grad_norm": 0.929279088973999, + "learning_rate": 3.097271972222926e-06, + "loss": 0.5841, + "step": 6934 + }, + { + "epoch": 2.5600541471818854, + "grad_norm": 0.8971282839775085, + "learning_rate": 3.096799670475871e-06, + "loss": 0.6033, + "step": 6935 + }, + { + "epoch": 2.5604233325129213, + "grad_norm": 0.8941642642021179, + "learning_rate": 3.0963273461401376e-06, + "loss": 0.6131, + "step": 6936 + }, + { + "epoch": 2.5607925178439577, + "grad_norm": 0.8911741971969604, + "learning_rate": 3.0958549992336013e-06, + "loss": 0.631, + "step": 6937 + }, + { + "epoch": 2.561161703174994, + "grad_norm": 0.8763523697853088, + "learning_rate": 3.0953826297741424e-06, + "loss": 0.6242, + "step": 6938 + }, + { + "epoch": 2.56153088850603, + "grad_norm": 0.8892491459846497, + "learning_rate": 3.0949102377796375e-06, + "loss": 0.6208, + "step": 6939 + }, + { + "epoch": 2.5619000738370663, + "grad_norm": 0.916675329208374, + "learning_rate": 3.094437823267969e-06, + "loss": 0.5933, + "step": 6940 + }, + { + "epoch": 2.5622692591681027, + "grad_norm": 0.9086977243423462, + "learning_rate": 3.0939653862570156e-06, + "loss": 0.627, + "step": 6941 + }, + { + "epoch": 2.5626384444991386, + "grad_norm": 0.9266194105148315, + "learning_rate": 3.0934929267646608e-06, + "loss": 0.6123, + "step": 6942 + }, + { + "epoch": 2.5630076298301745, + "grad_norm": 0.885733962059021, + "learning_rate": 3.0930204448087847e-06, + "loss": 0.6279, + "step": 6943 + }, + { + "epoch": 2.563376815161211, + "grad_norm": 0.8759624361991882, + "learning_rate": 3.092547940407273e-06, + "loss": 0.6341, + "step": 6944 + }, + { + "epoch": 2.563746000492247, + "grad_norm": 0.9257074594497681, + "learning_rate": 3.0920754135780086e-06, + "loss": 0.6149, + "step": 6945 + }, + { + "epoch": 2.564115185823283, + "grad_norm": 0.8868311047554016, + "learning_rate": 3.0916028643388763e-06, + "loss": 0.5859, + "step": 6946 + }, + { + "epoch": 2.5644843711543195, + "grad_norm": 0.8944568037986755, + "learning_rate": 3.0911302927077626e-06, + "loss": 0.6064, + "step": 6947 + }, + { + "epoch": 2.564853556485356, + "grad_norm": 0.8810253739356995, + "learning_rate": 3.0906576987025532e-06, + "loss": 0.6171, + "step": 6948 + }, + { + "epoch": 2.5652227418163918, + "grad_norm": 0.8841821551322937, + "learning_rate": 3.0901850823411374e-06, + "loss": 0.5948, + "step": 6949 + }, + { + "epoch": 2.565591927147428, + "grad_norm": 0.89727783203125, + "learning_rate": 3.0897124436414023e-06, + "loss": 0.6267, + "step": 6950 + }, + { + "epoch": 2.565961112478464, + "grad_norm": 0.8749191164970398, + "learning_rate": 3.0892397826212365e-06, + "loss": 0.6111, + "step": 6951 + }, + { + "epoch": 2.5663302978095004, + "grad_norm": 0.8770114779472351, + "learning_rate": 3.088767099298532e-06, + "loss": 0.6275, + "step": 6952 + }, + { + "epoch": 2.5666994831405363, + "grad_norm": 0.8772600889205933, + "learning_rate": 3.088294393691177e-06, + "loss": 0.6176, + "step": 6953 + }, + { + "epoch": 2.5670686684715727, + "grad_norm": 0.8697948455810547, + "learning_rate": 3.0878216658170662e-06, + "loss": 0.5774, + "step": 6954 + }, + { + "epoch": 2.567437853802609, + "grad_norm": 0.8849372863769531, + "learning_rate": 3.0873489156940907e-06, + "loss": 0.6199, + "step": 6955 + }, + { + "epoch": 2.567807039133645, + "grad_norm": 0.8873183727264404, + "learning_rate": 3.0868761433401447e-06, + "loss": 0.6047, + "step": 6956 + }, + { + "epoch": 2.5681762244646813, + "grad_norm": 0.924704372882843, + "learning_rate": 3.0864033487731205e-06, + "loss": 0.599, + "step": 6957 + }, + { + "epoch": 2.5685454097957177, + "grad_norm": 0.8830130100250244, + "learning_rate": 3.085930532010915e-06, + "loss": 0.5965, + "step": 6958 + }, + { + "epoch": 2.5689145951267536, + "grad_norm": 0.9043479561805725, + "learning_rate": 3.085457693071424e-06, + "loss": 0.6093, + "step": 6959 + }, + { + "epoch": 2.56928378045779, + "grad_norm": 0.886027455329895, + "learning_rate": 3.0849848319725444e-06, + "loss": 0.5965, + "step": 6960 + }, + { + "epoch": 2.569652965788826, + "grad_norm": 0.8745759725570679, + "learning_rate": 3.0845119487321736e-06, + "loss": 0.5798, + "step": 6961 + }, + { + "epoch": 2.570022151119862, + "grad_norm": 0.8548250198364258, + "learning_rate": 3.0840390433682093e-06, + "loss": 0.5994, + "step": 6962 + }, + { + "epoch": 2.570391336450898, + "grad_norm": 0.894848108291626, + "learning_rate": 3.083566115898552e-06, + "loss": 0.6235, + "step": 6963 + }, + { + "epoch": 2.5707605217819345, + "grad_norm": 0.8892999887466431, + "learning_rate": 3.0830931663411016e-06, + "loss": 0.6228, + "step": 6964 + }, + { + "epoch": 2.571129707112971, + "grad_norm": 0.905491292476654, + "learning_rate": 3.0826201947137584e-06, + "loss": 0.5886, + "step": 6965 + }, + { + "epoch": 2.5714988924440068, + "grad_norm": 0.9078452587127686, + "learning_rate": 3.0821472010344257e-06, + "loss": 0.5793, + "step": 6966 + }, + { + "epoch": 2.571868077775043, + "grad_norm": 0.9298433661460876, + "learning_rate": 3.081674185321003e-06, + "loss": 0.6221, + "step": 6967 + }, + { + "epoch": 2.5722372631060795, + "grad_norm": 0.8843700289726257, + "learning_rate": 3.0812011475913983e-06, + "loss": 0.6085, + "step": 6968 + }, + { + "epoch": 2.5726064484371154, + "grad_norm": 0.881583571434021, + "learning_rate": 3.080728087863512e-06, + "loss": 0.5924, + "step": 6969 + }, + { + "epoch": 2.5729756337681517, + "grad_norm": 0.8689223527908325, + "learning_rate": 3.0802550061552523e-06, + "loss": 0.6143, + "step": 6970 + }, + { + "epoch": 2.5733448190991877, + "grad_norm": 0.8971377611160278, + "learning_rate": 3.0797819024845227e-06, + "loss": 0.6257, + "step": 6971 + }, + { + "epoch": 2.573714004430224, + "grad_norm": 0.9042449593544006, + "learning_rate": 3.079308776869231e-06, + "loss": 0.6182, + "step": 6972 + }, + { + "epoch": 2.57408318976126, + "grad_norm": 0.9085806012153625, + "learning_rate": 3.078835629327285e-06, + "loss": 0.6211, + "step": 6973 + }, + { + "epoch": 2.5744523750922963, + "grad_norm": 0.8613004088401794, + "learning_rate": 3.078362459876594e-06, + "loss": 0.5746, + "step": 6974 + }, + { + "epoch": 2.5748215604233327, + "grad_norm": 0.897172212600708, + "learning_rate": 3.0778892685350655e-06, + "loss": 0.6214, + "step": 6975 + }, + { + "epoch": 2.5751907457543686, + "grad_norm": 0.8775335550308228, + "learning_rate": 3.077416055320611e-06, + "loss": 0.6162, + "step": 6976 + }, + { + "epoch": 2.575559931085405, + "grad_norm": 0.8849306106567383, + "learning_rate": 3.0769428202511402e-06, + "loss": 0.6387, + "step": 6977 + }, + { + "epoch": 2.5759291164164413, + "grad_norm": 0.8955250978469849, + "learning_rate": 3.0764695633445674e-06, + "loss": 0.5672, + "step": 6978 + }, + { + "epoch": 2.576298301747477, + "grad_norm": 0.8579733371734619, + "learning_rate": 3.0759962846188024e-06, + "loss": 0.6291, + "step": 6979 + }, + { + "epoch": 2.5766674870785136, + "grad_norm": 0.9338570833206177, + "learning_rate": 3.07552298409176e-06, + "loss": 0.6382, + "step": 6980 + }, + { + "epoch": 2.5770366724095495, + "grad_norm": 0.8687312602996826, + "learning_rate": 3.0750496617813534e-06, + "loss": 0.5973, + "step": 6981 + }, + { + "epoch": 2.577405857740586, + "grad_norm": 0.8791952133178711, + "learning_rate": 3.0745763177055007e-06, + "loss": 0.605, + "step": 6982 + }, + { + "epoch": 2.5777750430716218, + "grad_norm": 0.9194855093955994, + "learning_rate": 3.0741029518821142e-06, + "loss": 0.6296, + "step": 6983 + }, + { + "epoch": 2.578144228402658, + "grad_norm": 0.8850800395011902, + "learning_rate": 3.0736295643291127e-06, + "loss": 0.5908, + "step": 6984 + }, + { + "epoch": 2.5785134137336945, + "grad_norm": 0.8509643077850342, + "learning_rate": 3.073156155064413e-06, + "loss": 0.6102, + "step": 6985 + }, + { + "epoch": 2.5788825990647304, + "grad_norm": 0.8902988433837891, + "learning_rate": 3.072682724105933e-06, + "loss": 0.5882, + "step": 6986 + }, + { + "epoch": 2.5792517843957667, + "grad_norm": 0.843321681022644, + "learning_rate": 3.0722092714715935e-06, + "loss": 0.5758, + "step": 6987 + }, + { + "epoch": 2.579620969726803, + "grad_norm": 0.9156758189201355, + "learning_rate": 3.071735797179314e-06, + "loss": 0.6424, + "step": 6988 + }, + { + "epoch": 2.579990155057839, + "grad_norm": 0.896797776222229, + "learning_rate": 3.0712623012470144e-06, + "loss": 0.6614, + "step": 6989 + }, + { + "epoch": 2.5803593403888754, + "grad_norm": 0.8729121088981628, + "learning_rate": 3.0707887836926175e-06, + "loss": 0.6215, + "step": 6990 + }, + { + "epoch": 2.5807285257199113, + "grad_norm": 0.8963976502418518, + "learning_rate": 3.070315244534044e-06, + "loss": 0.6568, + "step": 6991 + }, + { + "epoch": 2.5810977110509477, + "grad_norm": 0.8836588263511658, + "learning_rate": 3.06984168378922e-06, + "loss": 0.5956, + "step": 6992 + }, + { + "epoch": 2.5814668963819836, + "grad_norm": 0.9038910865783691, + "learning_rate": 3.0693681014760673e-06, + "loss": 0.6155, + "step": 6993 + }, + { + "epoch": 2.58183608171302, + "grad_norm": 0.9010583162307739, + "learning_rate": 3.068894497612512e-06, + "loss": 0.6333, + "step": 6994 + }, + { + "epoch": 2.5822052670440563, + "grad_norm": 0.8914360404014587, + "learning_rate": 3.068420872216479e-06, + "loss": 0.6324, + "step": 6995 + }, + { + "epoch": 2.582574452375092, + "grad_norm": 0.882187008857727, + "learning_rate": 3.067947225305896e-06, + "loss": 0.6454, + "step": 6996 + }, + { + "epoch": 2.5829436377061286, + "grad_norm": 0.8672683835029602, + "learning_rate": 3.0674735568986897e-06, + "loss": 0.6034, + "step": 6997 + }, + { + "epoch": 2.583312823037165, + "grad_norm": 1.00066339969635, + "learning_rate": 3.066999867012788e-06, + "loss": 0.6265, + "step": 6998 + }, + { + "epoch": 2.583682008368201, + "grad_norm": 0.9051196575164795, + "learning_rate": 3.0665261556661206e-06, + "loss": 0.6228, + "step": 6999 + }, + { + "epoch": 2.5840511936992367, + "grad_norm": 0.9181146025657654, + "learning_rate": 3.066052422876617e-06, + "loss": 0.6267, + "step": 7000 + }, + { + "epoch": 2.584420379030273, + "grad_norm": 0.871817409992218, + "learning_rate": 3.0655786686622074e-06, + "loss": 0.5632, + "step": 7001 + }, + { + "epoch": 2.5847895643613095, + "grad_norm": 0.8921700119972229, + "learning_rate": 3.0651048930408244e-06, + "loss": 0.6216, + "step": 7002 + }, + { + "epoch": 2.5851587496923454, + "grad_norm": 0.8927743434906006, + "learning_rate": 3.064631096030399e-06, + "loss": 0.5839, + "step": 7003 + }, + { + "epoch": 2.5855279350233817, + "grad_norm": 0.8714900612831116, + "learning_rate": 3.0641572776488655e-06, + "loss": 0.6011, + "step": 7004 + }, + { + "epoch": 2.585897120354418, + "grad_norm": 0.884258508682251, + "learning_rate": 3.0636834379141556e-06, + "loss": 0.6088, + "step": 7005 + }, + { + "epoch": 2.586266305685454, + "grad_norm": 0.9099177718162537, + "learning_rate": 3.063209576844207e-06, + "loss": 0.621, + "step": 7006 + }, + { + "epoch": 2.5866354910164904, + "grad_norm": 0.8893060088157654, + "learning_rate": 3.062735694456953e-06, + "loss": 0.6565, + "step": 7007 + }, + { + "epoch": 2.5870046763475267, + "grad_norm": 0.9135155081748962, + "learning_rate": 3.062261790770331e-06, + "loss": 0.6019, + "step": 7008 + }, + { + "epoch": 2.5873738616785626, + "grad_norm": 0.9089148640632629, + "learning_rate": 3.0617878658022774e-06, + "loss": 0.681, + "step": 7009 + }, + { + "epoch": 2.5877430470095986, + "grad_norm": 0.8381227254867554, + "learning_rate": 3.0613139195707304e-06, + "loss": 0.5723, + "step": 7010 + }, + { + "epoch": 2.588112232340635, + "grad_norm": 0.9066243767738342, + "learning_rate": 3.060839952093628e-06, + "loss": 0.6459, + "step": 7011 + }, + { + "epoch": 2.5884814176716713, + "grad_norm": 0.8842077851295471, + "learning_rate": 3.0603659633889116e-06, + "loss": 0.6433, + "step": 7012 + }, + { + "epoch": 2.588850603002707, + "grad_norm": 0.8734721541404724, + "learning_rate": 3.0598919534745198e-06, + "loss": 0.5952, + "step": 7013 + }, + { + "epoch": 2.5892197883337436, + "grad_norm": 0.903427243232727, + "learning_rate": 3.0594179223683944e-06, + "loss": 0.6193, + "step": 7014 + }, + { + "epoch": 2.58958897366478, + "grad_norm": 0.8896883130073547, + "learning_rate": 3.0589438700884766e-06, + "loss": 0.629, + "step": 7015 + }, + { + "epoch": 2.589958158995816, + "grad_norm": 0.9066462516784668, + "learning_rate": 3.0584697966527106e-06, + "loss": 0.629, + "step": 7016 + }, + { + "epoch": 2.590327344326852, + "grad_norm": 0.8771199584007263, + "learning_rate": 3.0579957020790384e-06, + "loss": 0.6094, + "step": 7017 + }, + { + "epoch": 2.590696529657888, + "grad_norm": 0.9033408761024475, + "learning_rate": 3.0575215863854056e-06, + "loss": 0.621, + "step": 7018 + }, + { + "epoch": 2.5910657149889245, + "grad_norm": 0.877863883972168, + "learning_rate": 3.0570474495897558e-06, + "loss": 0.5839, + "step": 7019 + }, + { + "epoch": 2.5914349003199604, + "grad_norm": 0.8779619336128235, + "learning_rate": 3.0565732917100367e-06, + "loss": 0.5908, + "step": 7020 + }, + { + "epoch": 2.5918040856509967, + "grad_norm": 0.8939217329025269, + "learning_rate": 3.0560991127641936e-06, + "loss": 0.6229, + "step": 7021 + }, + { + "epoch": 2.592173270982033, + "grad_norm": 0.88267582654953, + "learning_rate": 3.0556249127701755e-06, + "loss": 0.6093, + "step": 7022 + }, + { + "epoch": 2.592542456313069, + "grad_norm": 0.8630437850952148, + "learning_rate": 3.055150691745929e-06, + "loss": 0.5876, + "step": 7023 + }, + { + "epoch": 2.5929116416441054, + "grad_norm": 0.8857970833778381, + "learning_rate": 3.054676449709404e-06, + "loss": 0.5908, + "step": 7024 + }, + { + "epoch": 2.5932808269751417, + "grad_norm": 0.8852280378341675, + "learning_rate": 3.0542021866785505e-06, + "loss": 0.6143, + "step": 7025 + }, + { + "epoch": 2.5936500123061776, + "grad_norm": 0.908075749874115, + "learning_rate": 3.0537279026713196e-06, + "loss": 0.6132, + "step": 7026 + }, + { + "epoch": 2.594019197637214, + "grad_norm": 1.0957411527633667, + "learning_rate": 3.0532535977056616e-06, + "loss": 0.6118, + "step": 7027 + }, + { + "epoch": 2.59438838296825, + "grad_norm": 0.9042800068855286, + "learning_rate": 3.05277927179953e-06, + "loss": 0.6132, + "step": 7028 + }, + { + "epoch": 2.5947575682992863, + "grad_norm": 0.8691474199295044, + "learning_rate": 3.052304924970877e-06, + "loss": 0.5742, + "step": 7029 + }, + { + "epoch": 2.595126753630322, + "grad_norm": 0.8715176582336426, + "learning_rate": 3.051830557237657e-06, + "loss": 0.5893, + "step": 7030 + }, + { + "epoch": 2.5954959389613586, + "grad_norm": 0.9288526773452759, + "learning_rate": 3.051356168617825e-06, + "loss": 0.6172, + "step": 7031 + }, + { + "epoch": 2.595865124292395, + "grad_norm": 0.8569787740707397, + "learning_rate": 3.0508817591293367e-06, + "loss": 0.591, + "step": 7032 + }, + { + "epoch": 2.596234309623431, + "grad_norm": 0.9151125550270081, + "learning_rate": 3.0504073287901466e-06, + "loss": 0.607, + "step": 7033 + }, + { + "epoch": 2.596603494954467, + "grad_norm": 0.8554785847663879, + "learning_rate": 3.0499328776182125e-06, + "loss": 0.6156, + "step": 7034 + }, + { + "epoch": 2.5969726802855035, + "grad_norm": 0.9016808867454529, + "learning_rate": 3.049458405631493e-06, + "loss": 0.6181, + "step": 7035 + }, + { + "epoch": 2.5973418656165395, + "grad_norm": 0.8616736531257629, + "learning_rate": 3.048983912847947e-06, + "loss": 0.6045, + "step": 7036 + }, + { + "epoch": 2.597711050947576, + "grad_norm": 0.9051918387413025, + "learning_rate": 3.048509399285532e-06, + "loss": 0.6636, + "step": 7037 + }, + { + "epoch": 2.5980802362786117, + "grad_norm": 0.9101077914237976, + "learning_rate": 3.04803486496221e-06, + "loss": 0.6016, + "step": 7038 + }, + { + "epoch": 2.598449421609648, + "grad_norm": 0.8704349398612976, + "learning_rate": 3.0475603098959404e-06, + "loss": 0.6259, + "step": 7039 + }, + { + "epoch": 2.598818606940684, + "grad_norm": 0.8901421427726746, + "learning_rate": 3.047085734104687e-06, + "loss": 0.6325, + "step": 7040 + }, + { + "epoch": 2.5991877922717204, + "grad_norm": 0.8953085541725159, + "learning_rate": 3.046611137606411e-06, + "loss": 0.621, + "step": 7041 + }, + { + "epoch": 2.5995569776027567, + "grad_norm": 0.8908173441886902, + "learning_rate": 3.046136520419076e-06, + "loss": 0.617, + "step": 7042 + }, + { + "epoch": 2.5999261629337926, + "grad_norm": 0.9219982624053955, + "learning_rate": 3.0456618825606455e-06, + "loss": 0.626, + "step": 7043 + }, + { + "epoch": 2.600295348264829, + "grad_norm": 0.8865200281143188, + "learning_rate": 3.045187224049086e-06, + "loss": 0.6416, + "step": 7044 + }, + { + "epoch": 2.6006645335958654, + "grad_norm": 0.8842888474464417, + "learning_rate": 3.044712544902361e-06, + "loss": 0.6001, + "step": 7045 + }, + { + "epoch": 2.6010337189269013, + "grad_norm": 0.8884024620056152, + "learning_rate": 3.044237845138439e-06, + "loss": 0.6272, + "step": 7046 + }, + { + "epoch": 2.6014029042579376, + "grad_norm": 0.9466997981071472, + "learning_rate": 3.043763124775286e-06, + "loss": 0.6262, + "step": 7047 + }, + { + "epoch": 2.6017720895889735, + "grad_norm": 0.8934352397918701, + "learning_rate": 3.0432883838308703e-06, + "loss": 0.5812, + "step": 7048 + }, + { + "epoch": 2.60214127492001, + "grad_norm": 0.8789699673652649, + "learning_rate": 3.0428136223231604e-06, + "loss": 0.6225, + "step": 7049 + }, + { + "epoch": 2.602510460251046, + "grad_norm": 0.886158287525177, + "learning_rate": 3.0423388402701264e-06, + "loss": 0.6084, + "step": 7050 + }, + { + "epoch": 2.602879645582082, + "grad_norm": 0.867234468460083, + "learning_rate": 3.041864037689739e-06, + "loss": 0.5969, + "step": 7051 + }, + { + "epoch": 2.6032488309131185, + "grad_norm": 0.8820392489433289, + "learning_rate": 3.0413892145999685e-06, + "loss": 0.6274, + "step": 7052 + }, + { + "epoch": 2.6036180162441545, + "grad_norm": 0.8914090991020203, + "learning_rate": 3.040914371018786e-06, + "loss": 0.6167, + "step": 7053 + }, + { + "epoch": 2.603987201575191, + "grad_norm": 0.8990617394447327, + "learning_rate": 3.0404395069641667e-06, + "loss": 0.6156, + "step": 7054 + }, + { + "epoch": 2.604356386906227, + "grad_norm": 0.9197782278060913, + "learning_rate": 3.039964622454082e-06, + "loss": 0.6539, + "step": 7055 + }, + { + "epoch": 2.604725572237263, + "grad_norm": 0.8939939737319946, + "learning_rate": 3.0394897175065076e-06, + "loss": 0.6077, + "step": 7056 + }, + { + "epoch": 2.6050947575682994, + "grad_norm": 0.9065244793891907, + "learning_rate": 3.0390147921394165e-06, + "loss": 0.6356, + "step": 7057 + }, + { + "epoch": 2.6054639428993354, + "grad_norm": 0.9039509296417236, + "learning_rate": 3.038539846370786e-06, + "loss": 0.6339, + "step": 7058 + }, + { + "epoch": 2.6058331282303717, + "grad_norm": 0.8634424209594727, + "learning_rate": 3.038064880218592e-06, + "loss": 0.6086, + "step": 7059 + }, + { + "epoch": 2.6062023135614076, + "grad_norm": 0.9237685203552246, + "learning_rate": 3.037589893700813e-06, + "loss": 0.6239, + "step": 7060 + }, + { + "epoch": 2.606571498892444, + "grad_norm": 0.8870430588722229, + "learning_rate": 3.0371148868354257e-06, + "loss": 0.6459, + "step": 7061 + }, + { + "epoch": 2.6069406842234804, + "grad_norm": 0.9049957990646362, + "learning_rate": 3.036639859640409e-06, + "loss": 0.6073, + "step": 7062 + }, + { + "epoch": 2.6073098695545163, + "grad_norm": 0.8738628625869751, + "learning_rate": 3.036164812133743e-06, + "loss": 0.5882, + "step": 7063 + }, + { + "epoch": 2.6076790548855526, + "grad_norm": 0.9431330561637878, + "learning_rate": 3.035689744333409e-06, + "loss": 0.6135, + "step": 7064 + }, + { + "epoch": 2.608048240216589, + "grad_norm": 0.9258922934532166, + "learning_rate": 3.035214656257387e-06, + "loss": 0.6267, + "step": 7065 + }, + { + "epoch": 2.608417425547625, + "grad_norm": 0.9085554480552673, + "learning_rate": 3.0347395479236593e-06, + "loss": 0.644, + "step": 7066 + }, + { + "epoch": 2.608786610878661, + "grad_norm": 0.8690418601036072, + "learning_rate": 3.0342644193502076e-06, + "loss": 0.6138, + "step": 7067 + }, + { + "epoch": 2.609155796209697, + "grad_norm": 0.9234104752540588, + "learning_rate": 3.0337892705550165e-06, + "loss": 0.6001, + "step": 7068 + }, + { + "epoch": 2.6095249815407335, + "grad_norm": 0.8454982042312622, + "learning_rate": 3.0333141015560704e-06, + "loss": 0.6083, + "step": 7069 + }, + { + "epoch": 2.6098941668717694, + "grad_norm": 0.8648694753646851, + "learning_rate": 3.0328389123713538e-06, + "loss": 0.6184, + "step": 7070 + }, + { + "epoch": 2.610263352202806, + "grad_norm": 0.9105032682418823, + "learning_rate": 3.0323637030188517e-06, + "loss": 0.6181, + "step": 7071 + }, + { + "epoch": 2.610632537533842, + "grad_norm": 0.8854820728302002, + "learning_rate": 3.031888473516552e-06, + "loss": 0.6124, + "step": 7072 + }, + { + "epoch": 2.611001722864878, + "grad_norm": 0.9082671403884888, + "learning_rate": 3.0314132238824416e-06, + "loss": 0.6139, + "step": 7073 + }, + { + "epoch": 2.6113709081959144, + "grad_norm": 0.8930485248565674, + "learning_rate": 3.030937954134509e-06, + "loss": 0.5966, + "step": 7074 + }, + { + "epoch": 2.611740093526951, + "grad_norm": 0.8825952410697937, + "learning_rate": 3.0304626642907412e-06, + "loss": 0.6336, + "step": 7075 + }, + { + "epoch": 2.6121092788579867, + "grad_norm": 0.8869221806526184, + "learning_rate": 3.02998735436913e-06, + "loss": 0.6111, + "step": 7076 + }, + { + "epoch": 2.6124784641890226, + "grad_norm": 0.9121037125587463, + "learning_rate": 3.029512024387664e-06, + "loss": 0.6577, + "step": 7077 + }, + { + "epoch": 2.612847649520059, + "grad_norm": 0.8995426297187805, + "learning_rate": 3.0290366743643355e-06, + "loss": 0.6294, + "step": 7078 + }, + { + "epoch": 2.6132168348510953, + "grad_norm": 0.92025226354599, + "learning_rate": 3.0285613043171354e-06, + "loss": 0.6181, + "step": 7079 + }, + { + "epoch": 2.6135860201821313, + "grad_norm": 0.9120648503303528, + "learning_rate": 3.0280859142640572e-06, + "loss": 0.6295, + "step": 7080 + }, + { + "epoch": 2.6139552055131676, + "grad_norm": 0.916422963142395, + "learning_rate": 3.0276105042230937e-06, + "loss": 0.6536, + "step": 7081 + }, + { + "epoch": 2.614324390844204, + "grad_norm": 0.918430507183075, + "learning_rate": 3.0271350742122387e-06, + "loss": 0.5955, + "step": 7082 + }, + { + "epoch": 2.61469357617524, + "grad_norm": 0.9215174317359924, + "learning_rate": 3.026659624249488e-06, + "loss": 0.6453, + "step": 7083 + }, + { + "epoch": 2.6150627615062763, + "grad_norm": 0.8928914070129395, + "learning_rate": 3.0261841543528365e-06, + "loss": 0.6321, + "step": 7084 + }, + { + "epoch": 2.615431946837312, + "grad_norm": 0.8853437900543213, + "learning_rate": 3.0257086645402806e-06, + "loss": 0.6386, + "step": 7085 + }, + { + "epoch": 2.6158011321683485, + "grad_norm": 0.8936126828193665, + "learning_rate": 3.025233154829818e-06, + "loss": 0.6425, + "step": 7086 + }, + { + "epoch": 2.6161703174993844, + "grad_norm": 0.8885867595672607, + "learning_rate": 3.024757625239446e-06, + "loss": 0.6082, + "step": 7087 + }, + { + "epoch": 2.616539502830421, + "grad_norm": 0.8928549885749817, + "learning_rate": 3.0242820757871637e-06, + "loss": 0.6063, + "step": 7088 + }, + { + "epoch": 2.616908688161457, + "grad_norm": 0.9100514054298401, + "learning_rate": 3.0238065064909706e-06, + "loss": 0.6488, + "step": 7089 + }, + { + "epoch": 2.617277873492493, + "grad_norm": 0.8759944438934326, + "learning_rate": 3.0233309173688665e-06, + "loss": 0.6331, + "step": 7090 + }, + { + "epoch": 2.6176470588235294, + "grad_norm": 0.883312463760376, + "learning_rate": 3.022855308438852e-06, + "loss": 0.5844, + "step": 7091 + }, + { + "epoch": 2.618016244154566, + "grad_norm": 0.8792973160743713, + "learning_rate": 3.0223796797189287e-06, + "loss": 0.6072, + "step": 7092 + }, + { + "epoch": 2.6183854294856017, + "grad_norm": 0.884583592414856, + "learning_rate": 3.0219040312271e-06, + "loss": 0.6082, + "step": 7093 + }, + { + "epoch": 2.618754614816638, + "grad_norm": 0.9293040633201599, + "learning_rate": 3.0214283629813683e-06, + "loss": 0.6097, + "step": 7094 + }, + { + "epoch": 2.619123800147674, + "grad_norm": 0.9017032980918884, + "learning_rate": 3.0209526749997373e-06, + "loss": 0.6266, + "step": 7095 + }, + { + "epoch": 2.6194929854787103, + "grad_norm": 0.8607946038246155, + "learning_rate": 3.0204769673002123e-06, + "loss": 0.5834, + "step": 7096 + }, + { + "epoch": 2.6198621708097463, + "grad_norm": 0.904453456401825, + "learning_rate": 3.020001239900799e-06, + "loss": 0.5752, + "step": 7097 + }, + { + "epoch": 2.6202313561407826, + "grad_norm": 0.8731056451797485, + "learning_rate": 3.0195254928195016e-06, + "loss": 0.5726, + "step": 7098 + }, + { + "epoch": 2.620600541471819, + "grad_norm": 0.9241552948951721, + "learning_rate": 3.019049726074329e-06, + "loss": 0.6288, + "step": 7099 + }, + { + "epoch": 2.620969726802855, + "grad_norm": 0.9078872799873352, + "learning_rate": 3.0185739396832873e-06, + "loss": 0.617, + "step": 7100 + }, + { + "epoch": 2.6213389121338913, + "grad_norm": 0.8489261269569397, + "learning_rate": 3.018098133664387e-06, + "loss": 0.5938, + "step": 7101 + }, + { + "epoch": 2.6217080974649276, + "grad_norm": 0.8798846006393433, + "learning_rate": 3.017622308035635e-06, + "loss": 0.6274, + "step": 7102 + }, + { + "epoch": 2.6220772827959635, + "grad_norm": 0.8798366189002991, + "learning_rate": 3.0171464628150423e-06, + "loss": 0.6457, + "step": 7103 + }, + { + "epoch": 2.622446468127, + "grad_norm": 0.9158898591995239, + "learning_rate": 3.016670598020619e-06, + "loss": 0.5973, + "step": 7104 + }, + { + "epoch": 2.622815653458036, + "grad_norm": 0.8889309167861938, + "learning_rate": 3.016194713670377e-06, + "loss": 0.6098, + "step": 7105 + }, + { + "epoch": 2.623184838789072, + "grad_norm": 0.8890911936759949, + "learning_rate": 3.015718809782327e-06, + "loss": 0.6081, + "step": 7106 + }, + { + "epoch": 2.623554024120108, + "grad_norm": 0.8565577864646912, + "learning_rate": 3.015242886374484e-06, + "loss": 0.5977, + "step": 7107 + }, + { + "epoch": 2.6239232094511444, + "grad_norm": 0.8981603384017944, + "learning_rate": 3.0147669434648597e-06, + "loss": 0.6229, + "step": 7108 + }, + { + "epoch": 2.624292394782181, + "grad_norm": 0.8955033421516418, + "learning_rate": 3.014290981071469e-06, + "loss": 0.6579, + "step": 7109 + }, + { + "epoch": 2.6246615801132167, + "grad_norm": 0.9243045449256897, + "learning_rate": 3.013814999212327e-06, + "loss": 0.6359, + "step": 7110 + }, + { + "epoch": 2.625030765444253, + "grad_norm": 0.8738952875137329, + "learning_rate": 3.01333899790545e-06, + "loss": 0.585, + "step": 7111 + }, + { + "epoch": 2.6253999507752894, + "grad_norm": 0.886225163936615, + "learning_rate": 3.0128629771688533e-06, + "loss": 0.6025, + "step": 7112 + }, + { + "epoch": 2.6257691361063253, + "grad_norm": 0.8751848340034485, + "learning_rate": 3.012386937020555e-06, + "loss": 0.6079, + "step": 7113 + }, + { + "epoch": 2.6261383214373617, + "grad_norm": 0.9022886157035828, + "learning_rate": 3.0119108774785726e-06, + "loss": 0.6179, + "step": 7114 + }, + { + "epoch": 2.6265075067683976, + "grad_norm": 0.8590001463890076, + "learning_rate": 3.011434798560926e-06, + "loss": 0.6186, + "step": 7115 + }, + { + "epoch": 2.626876692099434, + "grad_norm": 0.8915167450904846, + "learning_rate": 3.010958700285633e-06, + "loss": 0.6219, + "step": 7116 + }, + { + "epoch": 2.62724587743047, + "grad_norm": 0.8990417718887329, + "learning_rate": 3.010482582670714e-06, + "loss": 0.6537, + "step": 7117 + }, + { + "epoch": 2.6276150627615062, + "grad_norm": 0.8935746550559998, + "learning_rate": 3.0100064457341916e-06, + "loss": 0.596, + "step": 7118 + }, + { + "epoch": 2.6279842480925426, + "grad_norm": 0.8513423800468445, + "learning_rate": 3.0095302894940847e-06, + "loss": 0.5955, + "step": 7119 + }, + { + "epoch": 2.6283534334235785, + "grad_norm": 0.876732587814331, + "learning_rate": 3.0090541139684183e-06, + "loss": 0.6591, + "step": 7120 + }, + { + "epoch": 2.628722618754615, + "grad_norm": 0.9054352045059204, + "learning_rate": 3.008577919175214e-06, + "loss": 0.6327, + "step": 7121 + }, + { + "epoch": 2.6290918040856512, + "grad_norm": 0.875978410243988, + "learning_rate": 3.0081017051324956e-06, + "loss": 0.5928, + "step": 7122 + }, + { + "epoch": 2.629460989416687, + "grad_norm": 0.8769848942756653, + "learning_rate": 3.0076254718582887e-06, + "loss": 0.6014, + "step": 7123 + }, + { + "epoch": 2.6298301747477235, + "grad_norm": 0.881517231464386, + "learning_rate": 3.007149219370617e-06, + "loss": 0.6318, + "step": 7124 + }, + { + "epoch": 2.6301993600787594, + "grad_norm": 0.8954079747200012, + "learning_rate": 3.0066729476875084e-06, + "loss": 0.6025, + "step": 7125 + }, + { + "epoch": 2.630568545409796, + "grad_norm": 0.9137060046195984, + "learning_rate": 3.0061966568269878e-06, + "loss": 0.6425, + "step": 7126 + }, + { + "epoch": 2.6309377307408317, + "grad_norm": 0.8875525593757629, + "learning_rate": 3.005720346807084e-06, + "loss": 0.6016, + "step": 7127 + }, + { + "epoch": 2.631306916071868, + "grad_norm": 0.8944580554962158, + "learning_rate": 3.005244017645824e-06, + "loss": 0.6508, + "step": 7128 + }, + { + "epoch": 2.6316761014029044, + "grad_norm": 0.9043740630149841, + "learning_rate": 3.0047676693612387e-06, + "loss": 0.606, + "step": 7129 + }, + { + "epoch": 2.6320452867339403, + "grad_norm": 0.8797224164009094, + "learning_rate": 3.0042913019713552e-06, + "loss": 0.6123, + "step": 7130 + }, + { + "epoch": 2.6324144720649767, + "grad_norm": 0.9097487926483154, + "learning_rate": 3.003814915494206e-06, + "loss": 0.5993, + "step": 7131 + }, + { + "epoch": 2.632783657396013, + "grad_norm": 0.8702475428581238, + "learning_rate": 3.003338509947821e-06, + "loss": 0.6207, + "step": 7132 + }, + { + "epoch": 2.633152842727049, + "grad_norm": 0.9098284840583801, + "learning_rate": 3.0028620853502317e-06, + "loss": 0.6388, + "step": 7133 + }, + { + "epoch": 2.633522028058085, + "grad_norm": 0.9269683957099915, + "learning_rate": 3.002385641719471e-06, + "loss": 0.6381, + "step": 7134 + }, + { + "epoch": 2.6338912133891212, + "grad_norm": 0.8912243247032166, + "learning_rate": 3.0019091790735732e-06, + "loss": 0.6319, + "step": 7135 + }, + { + "epoch": 2.6342603987201576, + "grad_norm": 0.8965877890586853, + "learning_rate": 3.0014326974305715e-06, + "loss": 0.6101, + "step": 7136 + }, + { + "epoch": 2.6346295840511935, + "grad_norm": 0.9039759039878845, + "learning_rate": 3.0009561968084994e-06, + "loss": 0.6663, + "step": 7137 + }, + { + "epoch": 2.63499876938223, + "grad_norm": 0.9153173565864563, + "learning_rate": 3.0004796772253935e-06, + "loss": 0.6336, + "step": 7138 + }, + { + "epoch": 2.6353679547132662, + "grad_norm": 0.8848626017570496, + "learning_rate": 3.0000031386992907e-06, + "loss": 0.6182, + "step": 7139 + }, + { + "epoch": 2.635737140044302, + "grad_norm": 0.915961742401123, + "learning_rate": 2.9995265812482264e-06, + "loss": 0.6386, + "step": 7140 + }, + { + "epoch": 2.6361063253753385, + "grad_norm": 0.8771063089370728, + "learning_rate": 2.999050004890239e-06, + "loss": 0.606, + "step": 7141 + }, + { + "epoch": 2.636475510706375, + "grad_norm": 0.8629640936851501, + "learning_rate": 2.998573409643366e-06, + "loss": 0.5951, + "step": 7142 + }, + { + "epoch": 2.636844696037411, + "grad_norm": 0.9081624150276184, + "learning_rate": 2.9980967955256467e-06, + "loss": 0.5683, + "step": 7143 + }, + { + "epoch": 2.6372138813684467, + "grad_norm": 0.8858315944671631, + "learning_rate": 2.9976201625551204e-06, + "loss": 0.5994, + "step": 7144 + }, + { + "epoch": 2.637583066699483, + "grad_norm": 0.9010271430015564, + "learning_rate": 2.9971435107498293e-06, + "loss": 0.6117, + "step": 7145 + }, + { + "epoch": 2.6379522520305194, + "grad_norm": 0.9087921380996704, + "learning_rate": 2.9966668401278126e-06, + "loss": 0.6466, + "step": 7146 + }, + { + "epoch": 2.6383214373615553, + "grad_norm": 0.888828694820404, + "learning_rate": 2.9961901507071123e-06, + "loss": 0.6299, + "step": 7147 + }, + { + "epoch": 2.6386906226925917, + "grad_norm": 0.8904872536659241, + "learning_rate": 2.995713442505772e-06, + "loss": 0.6426, + "step": 7148 + }, + { + "epoch": 2.639059808023628, + "grad_norm": 0.9074708819389343, + "learning_rate": 2.9952367155418343e-06, + "loss": 0.617, + "step": 7149 + }, + { + "epoch": 2.639428993354664, + "grad_norm": 0.9419956207275391, + "learning_rate": 2.994759969833343e-06, + "loss": 0.6225, + "step": 7150 + }, + { + "epoch": 2.6397981786857003, + "grad_norm": 0.8976385593414307, + "learning_rate": 2.9942832053983433e-06, + "loss": 0.6196, + "step": 7151 + }, + { + "epoch": 2.6401673640167362, + "grad_norm": 0.8638802766799927, + "learning_rate": 2.9938064222548797e-06, + "loss": 0.5941, + "step": 7152 + }, + { + "epoch": 2.6405365493477726, + "grad_norm": 0.8906550407409668, + "learning_rate": 2.9933296204209996e-06, + "loss": 0.5924, + "step": 7153 + }, + { + "epoch": 2.6409057346788085, + "grad_norm": 0.8825253248214722, + "learning_rate": 2.9928527999147483e-06, + "loss": 0.6014, + "step": 7154 + }, + { + "epoch": 2.641274920009845, + "grad_norm": 0.8723636269569397, + "learning_rate": 2.992375960754175e-06, + "loss": 0.5939, + "step": 7155 + }, + { + "epoch": 2.6416441053408812, + "grad_norm": 0.9116057753562927, + "learning_rate": 2.9918991029573264e-06, + "loss": 0.6344, + "step": 7156 + }, + { + "epoch": 2.642013290671917, + "grad_norm": 0.8967413902282715, + "learning_rate": 2.9914222265422517e-06, + "loss": 0.6089, + "step": 7157 + }, + { + "epoch": 2.6423824760029535, + "grad_norm": 0.8796014189720154, + "learning_rate": 2.990945331527001e-06, + "loss": 0.623, + "step": 7158 + }, + { + "epoch": 2.64275166133399, + "grad_norm": 0.8997879028320312, + "learning_rate": 2.990468417929625e-06, + "loss": 0.6113, + "step": 7159 + }, + { + "epoch": 2.643120846665026, + "grad_norm": 0.8823813199996948, + "learning_rate": 2.9899914857681734e-06, + "loss": 0.615, + "step": 7160 + }, + { + "epoch": 2.643490031996062, + "grad_norm": 0.9131659269332886, + "learning_rate": 2.9895145350606995e-06, + "loss": 0.6327, + "step": 7161 + }, + { + "epoch": 2.643859217327098, + "grad_norm": 0.9174714684486389, + "learning_rate": 2.9890375658252535e-06, + "loss": 0.5708, + "step": 7162 + }, + { + "epoch": 2.6442284026581344, + "grad_norm": 0.9009746313095093, + "learning_rate": 2.9885605780798917e-06, + "loss": 0.6837, + "step": 7163 + }, + { + "epoch": 2.6445975879891703, + "grad_norm": 0.9158111214637756, + "learning_rate": 2.9880835718426653e-06, + "loss": 0.6287, + "step": 7164 + }, + { + "epoch": 2.6449667733202067, + "grad_norm": 0.9041900038719177, + "learning_rate": 2.9876065471316303e-06, + "loss": 0.6231, + "step": 7165 + }, + { + "epoch": 2.645335958651243, + "grad_norm": 0.9165393710136414, + "learning_rate": 2.9871295039648406e-06, + "loss": 0.6273, + "step": 7166 + }, + { + "epoch": 2.645705143982279, + "grad_norm": 0.8887408971786499, + "learning_rate": 2.986652442360354e-06, + "loss": 0.6103, + "step": 7167 + }, + { + "epoch": 2.6460743293133153, + "grad_norm": 0.8837975859642029, + "learning_rate": 2.986175362336225e-06, + "loss": 0.61, + "step": 7168 + }, + { + "epoch": 2.6464435146443517, + "grad_norm": 0.9089862108230591, + "learning_rate": 2.9856982639105124e-06, + "loss": 0.6259, + "step": 7169 + }, + { + "epoch": 2.6468126999753876, + "grad_norm": 0.8547838926315308, + "learning_rate": 2.9852211471012738e-06, + "loss": 0.5877, + "step": 7170 + }, + { + "epoch": 2.647181885306424, + "grad_norm": 0.8702665567398071, + "learning_rate": 2.9847440119265673e-06, + "loss": 0.6311, + "step": 7171 + }, + { + "epoch": 2.64755107063746, + "grad_norm": 0.9047566652297974, + "learning_rate": 2.984266858404454e-06, + "loss": 0.6062, + "step": 7172 + }, + { + "epoch": 2.6479202559684962, + "grad_norm": 0.9032107591629028, + "learning_rate": 2.9837896865529924e-06, + "loss": 0.6051, + "step": 7173 + }, + { + "epoch": 2.648289441299532, + "grad_norm": 0.896721601486206, + "learning_rate": 2.9833124963902436e-06, + "loss": 0.6138, + "step": 7174 + }, + { + "epoch": 2.6486586266305685, + "grad_norm": 0.9422100186347961, + "learning_rate": 2.9828352879342704e-06, + "loss": 0.6288, + "step": 7175 + }, + { + "epoch": 2.649027811961605, + "grad_norm": 0.8689942359924316, + "learning_rate": 2.9823580612031323e-06, + "loss": 0.6147, + "step": 7176 + }, + { + "epoch": 2.6493969972926408, + "grad_norm": 0.9040666222572327, + "learning_rate": 2.9818808162148953e-06, + "loss": 0.6162, + "step": 7177 + }, + { + "epoch": 2.649766182623677, + "grad_norm": 0.9080602526664734, + "learning_rate": 2.9814035529876206e-06, + "loss": 0.618, + "step": 7178 + }, + { + "epoch": 2.6501353679547135, + "grad_norm": 0.9206766486167908, + "learning_rate": 2.9809262715393743e-06, + "loss": 0.616, + "step": 7179 + }, + { + "epoch": 2.6505045532857494, + "grad_norm": 0.8760641813278198, + "learning_rate": 2.9804489718882197e-06, + "loss": 0.6152, + "step": 7180 + }, + { + "epoch": 2.6508737386167858, + "grad_norm": 0.8687725067138672, + "learning_rate": 2.9799716540522227e-06, + "loss": 0.5723, + "step": 7181 + }, + { + "epoch": 2.6512429239478217, + "grad_norm": 0.8950977921485901, + "learning_rate": 2.9794943180494506e-06, + "loss": 0.6328, + "step": 7182 + }, + { + "epoch": 2.651612109278858, + "grad_norm": 0.8995038866996765, + "learning_rate": 2.9790169638979705e-06, + "loss": 0.6024, + "step": 7183 + }, + { + "epoch": 2.651981294609894, + "grad_norm": 0.8624200224876404, + "learning_rate": 2.978539591615848e-06, + "loss": 0.5837, + "step": 7184 + }, + { + "epoch": 2.6523504799409303, + "grad_norm": 0.9204958081245422, + "learning_rate": 2.9780622012211543e-06, + "loss": 0.6447, + "step": 7185 + }, + { + "epoch": 2.6527196652719667, + "grad_norm": 0.8609824776649475, + "learning_rate": 2.9775847927319567e-06, + "loss": 0.6319, + "step": 7186 + }, + { + "epoch": 2.6530888506030026, + "grad_norm": 0.8934695720672607, + "learning_rate": 2.9771073661663256e-06, + "loss": 0.6041, + "step": 7187 + }, + { + "epoch": 2.653458035934039, + "grad_norm": 0.9055076837539673, + "learning_rate": 2.976629921542331e-06, + "loss": 0.6011, + "step": 7188 + }, + { + "epoch": 2.6538272212650753, + "grad_norm": 0.9772597551345825, + "learning_rate": 2.976152458878045e-06, + "loss": 0.6488, + "step": 7189 + }, + { + "epoch": 2.6541964065961112, + "grad_norm": 0.8715004920959473, + "learning_rate": 2.9756749781915373e-06, + "loss": 0.5796, + "step": 7190 + }, + { + "epoch": 2.6545655919271476, + "grad_norm": 0.8810442686080933, + "learning_rate": 2.975197479500883e-06, + "loss": 0.6158, + "step": 7191 + }, + { + "epoch": 2.6549347772581835, + "grad_norm": 0.9187114238739014, + "learning_rate": 2.9747199628241536e-06, + "loss": 0.6417, + "step": 7192 + }, + { + "epoch": 2.65530396258922, + "grad_norm": 0.8992149829864502, + "learning_rate": 2.9742424281794236e-06, + "loss": 0.6044, + "step": 7193 + }, + { + "epoch": 2.6556731479202558, + "grad_norm": 0.9060842394828796, + "learning_rate": 2.9737648755847674e-06, + "loss": 0.6127, + "step": 7194 + }, + { + "epoch": 2.656042333251292, + "grad_norm": 0.878780722618103, + "learning_rate": 2.97328730505826e-06, + "loss": 0.6415, + "step": 7195 + }, + { + "epoch": 2.6564115185823285, + "grad_norm": 0.8782097101211548, + "learning_rate": 2.972809716617977e-06, + "loss": 0.6089, + "step": 7196 + }, + { + "epoch": 2.6567807039133644, + "grad_norm": 0.9186558127403259, + "learning_rate": 2.972332110281997e-06, + "loss": 0.6478, + "step": 7197 + }, + { + "epoch": 2.6571498892444008, + "grad_norm": 0.9268781542778015, + "learning_rate": 2.971854486068394e-06, + "loss": 0.6243, + "step": 7198 + }, + { + "epoch": 2.657519074575437, + "grad_norm": 0.9608319401741028, + "learning_rate": 2.971376843995249e-06, + "loss": 0.6351, + "step": 7199 + }, + { + "epoch": 2.657888259906473, + "grad_norm": 0.932717502117157, + "learning_rate": 2.9708991840806378e-06, + "loss": 0.6329, + "step": 7200 + }, + { + "epoch": 2.658257445237509, + "grad_norm": 0.8602195978164673, + "learning_rate": 2.9704215063426425e-06, + "loss": 0.6103, + "step": 7201 + }, + { + "epoch": 2.6586266305685453, + "grad_norm": 0.8831150531768799, + "learning_rate": 2.9699438107993404e-06, + "loss": 0.6003, + "step": 7202 + }, + { + "epoch": 2.6589958158995817, + "grad_norm": 0.869324266910553, + "learning_rate": 2.9694660974688144e-06, + "loss": 0.6083, + "step": 7203 + }, + { + "epoch": 2.6593650012306176, + "grad_norm": 0.9252071380615234, + "learning_rate": 2.968988366369144e-06, + "loss": 0.6468, + "step": 7204 + }, + { + "epoch": 2.659734186561654, + "grad_norm": 0.9059156179428101, + "learning_rate": 2.9685106175184124e-06, + "loss": 0.6199, + "step": 7205 + }, + { + "epoch": 2.6601033718926903, + "grad_norm": 0.8991444110870361, + "learning_rate": 2.968032850934702e-06, + "loss": 0.6495, + "step": 7206 + }, + { + "epoch": 2.660472557223726, + "grad_norm": 0.928137481212616, + "learning_rate": 2.967555066636095e-06, + "loss": 0.6135, + "step": 7207 + }, + { + "epoch": 2.6608417425547626, + "grad_norm": 0.882851779460907, + "learning_rate": 2.967077264640677e-06, + "loss": 0.5691, + "step": 7208 + }, + { + "epoch": 2.661210927885799, + "grad_norm": 0.8967177867889404, + "learning_rate": 2.966599444966532e-06, + "loss": 0.6184, + "step": 7209 + }, + { + "epoch": 2.661580113216835, + "grad_norm": 0.8906081914901733, + "learning_rate": 2.966121607631745e-06, + "loss": 0.6001, + "step": 7210 + }, + { + "epoch": 2.6619492985478708, + "grad_norm": 0.8852339386940002, + "learning_rate": 2.9656437526544023e-06, + "loss": 0.6099, + "step": 7211 + }, + { + "epoch": 2.662318483878907, + "grad_norm": 0.8790413737297058, + "learning_rate": 2.9651658800525905e-06, + "loss": 0.6255, + "step": 7212 + }, + { + "epoch": 2.6626876692099435, + "grad_norm": 0.9204836487770081, + "learning_rate": 2.9646879898443975e-06, + "loss": 0.6557, + "step": 7213 + }, + { + "epoch": 2.6630568545409794, + "grad_norm": 0.8636997938156128, + "learning_rate": 2.9642100820479097e-06, + "loss": 0.6066, + "step": 7214 + }, + { + "epoch": 2.6634260398720158, + "grad_norm": 0.9014278054237366, + "learning_rate": 2.9637321566812176e-06, + "loss": 0.6676, + "step": 7215 + }, + { + "epoch": 2.663795225203052, + "grad_norm": 0.9453534483909607, + "learning_rate": 2.96325421376241e-06, + "loss": 0.6056, + "step": 7216 + }, + { + "epoch": 2.664164410534088, + "grad_norm": 0.8730406761169434, + "learning_rate": 2.962776253309576e-06, + "loss": 0.5893, + "step": 7217 + }, + { + "epoch": 2.6645335958651244, + "grad_norm": 0.8904109001159668, + "learning_rate": 2.962298275340807e-06, + "loss": 0.5836, + "step": 7218 + }, + { + "epoch": 2.6649027811961603, + "grad_norm": 0.8845539093017578, + "learning_rate": 2.961820279874194e-06, + "loss": 0.6235, + "step": 7219 + }, + { + "epoch": 2.6652719665271967, + "grad_norm": 0.9059796929359436, + "learning_rate": 2.961342266927829e-06, + "loss": 0.6054, + "step": 7220 + }, + { + "epoch": 2.6656411518582326, + "grad_norm": 0.8766582608222961, + "learning_rate": 2.9608642365198052e-06, + "loss": 0.5649, + "step": 7221 + }, + { + "epoch": 2.666010337189269, + "grad_norm": 0.8969059586524963, + "learning_rate": 2.9603861886682158e-06, + "loss": 0.6385, + "step": 7222 + }, + { + "epoch": 2.6663795225203053, + "grad_norm": 0.9149233102798462, + "learning_rate": 2.9599081233911536e-06, + "loss": 0.6194, + "step": 7223 + }, + { + "epoch": 2.666748707851341, + "grad_norm": 0.9053657054901123, + "learning_rate": 2.9594300407067144e-06, + "loss": 0.629, + "step": 7224 + }, + { + "epoch": 2.6671178931823776, + "grad_norm": 0.8971558213233948, + "learning_rate": 2.9589519406329938e-06, + "loss": 0.5952, + "step": 7225 + }, + { + "epoch": 2.667487078513414, + "grad_norm": 0.9053114652633667, + "learning_rate": 2.9584738231880868e-06, + "loss": 0.6373, + "step": 7226 + }, + { + "epoch": 2.66785626384445, + "grad_norm": 0.9203220009803772, + "learning_rate": 2.95799568839009e-06, + "loss": 0.6423, + "step": 7227 + }, + { + "epoch": 2.668225449175486, + "grad_norm": 0.905794084072113, + "learning_rate": 2.957517536257101e-06, + "loss": 0.6206, + "step": 7228 + }, + { + "epoch": 2.668594634506522, + "grad_norm": 0.8813137412071228, + "learning_rate": 2.957039366807218e-06, + "loss": 0.6131, + "step": 7229 + }, + { + "epoch": 2.6689638198375585, + "grad_norm": 0.8910402059555054, + "learning_rate": 2.956561180058538e-06, + "loss": 0.6069, + "step": 7230 + }, + { + "epoch": 2.6693330051685944, + "grad_norm": 0.8896382451057434, + "learning_rate": 2.956082976029163e-06, + "loss": 0.6081, + "step": 7231 + }, + { + "epoch": 2.6697021904996308, + "grad_norm": 0.8452030420303345, + "learning_rate": 2.9556047547371904e-06, + "loss": 0.6012, + "step": 7232 + }, + { + "epoch": 2.670071375830667, + "grad_norm": 0.9132207632064819, + "learning_rate": 2.955126516200722e-06, + "loss": 0.6156, + "step": 7233 + }, + { + "epoch": 2.670440561161703, + "grad_norm": 0.9033889770507812, + "learning_rate": 2.954648260437858e-06, + "loss": 0.6765, + "step": 7234 + }, + { + "epoch": 2.6708097464927394, + "grad_norm": 0.8798934817314148, + "learning_rate": 2.9541699874667017e-06, + "loss": 0.6151, + "step": 7235 + }, + { + "epoch": 2.6711789318237757, + "grad_norm": 0.9126673340797424, + "learning_rate": 2.953691697305354e-06, + "loss": 0.646, + "step": 7236 + }, + { + "epoch": 2.6715481171548117, + "grad_norm": 0.8818350434303284, + "learning_rate": 2.9532133899719196e-06, + "loss": 0.5922, + "step": 7237 + }, + { + "epoch": 2.671917302485848, + "grad_norm": 0.8908956050872803, + "learning_rate": 2.9527350654845e-06, + "loss": 0.6323, + "step": 7238 + }, + { + "epoch": 2.672286487816884, + "grad_norm": 0.9066286683082581, + "learning_rate": 2.9522567238612025e-06, + "loss": 0.578, + "step": 7239 + }, + { + "epoch": 2.6726556731479203, + "grad_norm": 0.8901090621948242, + "learning_rate": 2.9517783651201296e-06, + "loss": 0.66, + "step": 7240 + }, + { + "epoch": 2.673024858478956, + "grad_norm": 0.9042249917984009, + "learning_rate": 2.951299989279389e-06, + "loss": 0.6162, + "step": 7241 + }, + { + "epoch": 2.6733940438099926, + "grad_norm": 0.8934651613235474, + "learning_rate": 2.9508215963570856e-06, + "loss": 0.6069, + "step": 7242 + }, + { + "epoch": 2.673763229141029, + "grad_norm": 0.8978985548019409, + "learning_rate": 2.950343186371328e-06, + "loss": 0.6332, + "step": 7243 + }, + { + "epoch": 2.674132414472065, + "grad_norm": 0.8886924386024475, + "learning_rate": 2.949864759340222e-06, + "loss": 0.6019, + "step": 7244 + }, + { + "epoch": 2.674501599803101, + "grad_norm": 0.9211569428443909, + "learning_rate": 2.949386315281878e-06, + "loss": 0.6327, + "step": 7245 + }, + { + "epoch": 2.6748707851341376, + "grad_norm": 0.8774517774581909, + "learning_rate": 2.9489078542144024e-06, + "loss": 0.5818, + "step": 7246 + }, + { + "epoch": 2.6752399704651735, + "grad_norm": 0.9017682671546936, + "learning_rate": 2.948429376155907e-06, + "loss": 0.6036, + "step": 7247 + }, + { + "epoch": 2.67560915579621, + "grad_norm": 0.9124335050582886, + "learning_rate": 2.9479508811245007e-06, + "loss": 0.6033, + "step": 7248 + }, + { + "epoch": 2.6759783411272458, + "grad_norm": 0.8726518750190735, + "learning_rate": 2.9474723691382957e-06, + "loss": 0.5824, + "step": 7249 + }, + { + "epoch": 2.676347526458282, + "grad_norm": 0.8885230422019958, + "learning_rate": 2.9469938402154018e-06, + "loss": 0.6068, + "step": 7250 + }, + { + "epoch": 2.676716711789318, + "grad_norm": 0.8928765654563904, + "learning_rate": 2.946515294373933e-06, + "loss": 0.6231, + "step": 7251 + }, + { + "epoch": 2.6770858971203544, + "grad_norm": 0.8943143486976624, + "learning_rate": 2.9460367316320003e-06, + "loss": 0.6282, + "step": 7252 + }, + { + "epoch": 2.6774550824513907, + "grad_norm": 0.8936024904251099, + "learning_rate": 2.945558152007719e-06, + "loss": 0.6298, + "step": 7253 + }, + { + "epoch": 2.6778242677824267, + "grad_norm": 0.8941588997840881, + "learning_rate": 2.945079555519202e-06, + "loss": 0.6313, + "step": 7254 + }, + { + "epoch": 2.678193453113463, + "grad_norm": 0.9027512073516846, + "learning_rate": 2.9446009421845646e-06, + "loss": 0.6318, + "step": 7255 + }, + { + "epoch": 2.6785626384444994, + "grad_norm": 0.8840638995170593, + "learning_rate": 2.9441223120219207e-06, + "loss": 0.5989, + "step": 7256 + }, + { + "epoch": 2.6789318237755353, + "grad_norm": 0.868406355381012, + "learning_rate": 2.9436436650493883e-06, + "loss": 0.6292, + "step": 7257 + }, + { + "epoch": 2.6793010091065717, + "grad_norm": 0.8927574753761292, + "learning_rate": 2.9431650012850834e-06, + "loss": 0.6269, + "step": 7258 + }, + { + "epoch": 2.6796701944376076, + "grad_norm": 0.917188286781311, + "learning_rate": 2.942686320747123e-06, + "loss": 0.6029, + "step": 7259 + }, + { + "epoch": 2.680039379768644, + "grad_norm": 0.8992957472801208, + "learning_rate": 2.9422076234536245e-06, + "loss": 0.6305, + "step": 7260 + }, + { + "epoch": 2.68040856509968, + "grad_norm": 0.8826090693473816, + "learning_rate": 2.941728909422707e-06, + "loss": 0.6192, + "step": 7261 + }, + { + "epoch": 2.680777750430716, + "grad_norm": 0.8823418021202087, + "learning_rate": 2.94125017867249e-06, + "loss": 0.614, + "step": 7262 + }, + { + "epoch": 2.6811469357617526, + "grad_norm": 0.8962643146514893, + "learning_rate": 2.9407714312210935e-06, + "loss": 0.5892, + "step": 7263 + }, + { + "epoch": 2.6815161210927885, + "grad_norm": 0.8926030397415161, + "learning_rate": 2.9402926670866364e-06, + "loss": 0.6037, + "step": 7264 + }, + { + "epoch": 2.681885306423825, + "grad_norm": 0.8965013027191162, + "learning_rate": 2.939813886287242e-06, + "loss": 0.5957, + "step": 7265 + }, + { + "epoch": 2.682254491754861, + "grad_norm": 0.8784346580505371, + "learning_rate": 2.9393350888410294e-06, + "loss": 0.6123, + "step": 7266 + }, + { + "epoch": 2.682623677085897, + "grad_norm": 0.8569821119308472, + "learning_rate": 2.938856274766124e-06, + "loss": 0.5805, + "step": 7267 + }, + { + "epoch": 2.6829928624169335, + "grad_norm": 0.895363450050354, + "learning_rate": 2.9383774440806456e-06, + "loss": 0.6008, + "step": 7268 + }, + { + "epoch": 2.6833620477479694, + "grad_norm": 0.9095697402954102, + "learning_rate": 2.9378985968027207e-06, + "loss": 0.6307, + "step": 7269 + }, + { + "epoch": 2.6837312330790057, + "grad_norm": 0.8682095408439636, + "learning_rate": 2.937419732950471e-06, + "loss": 0.5816, + "step": 7270 + }, + { + "epoch": 2.6841004184100417, + "grad_norm": 0.8816158175468445, + "learning_rate": 2.936940852542023e-06, + "loss": 0.5947, + "step": 7271 + }, + { + "epoch": 2.684469603741078, + "grad_norm": 0.8825366497039795, + "learning_rate": 2.936461955595501e-06, + "loss": 0.6151, + "step": 7272 + }, + { + "epoch": 2.6848387890721144, + "grad_norm": 0.8747841715812683, + "learning_rate": 2.9359830421290324e-06, + "loss": 0.5931, + "step": 7273 + }, + { + "epoch": 2.6852079744031503, + "grad_norm": 0.8942915797233582, + "learning_rate": 2.935504112160743e-06, + "loss": 0.6036, + "step": 7274 + }, + { + "epoch": 2.6855771597341866, + "grad_norm": 0.9052845239639282, + "learning_rate": 2.9350251657087607e-06, + "loss": 0.6401, + "step": 7275 + }, + { + "epoch": 2.685946345065223, + "grad_norm": 0.8696044087409973, + "learning_rate": 2.934546202791212e-06, + "loss": 0.5899, + "step": 7276 + }, + { + "epoch": 2.686315530396259, + "grad_norm": 0.9013299345970154, + "learning_rate": 2.934067223426228e-06, + "loss": 0.621, + "step": 7277 + }, + { + "epoch": 2.686684715727295, + "grad_norm": 0.9063544869422913, + "learning_rate": 2.933588227631936e-06, + "loss": 0.6369, + "step": 7278 + }, + { + "epoch": 2.687053901058331, + "grad_norm": 0.8831267952919006, + "learning_rate": 2.933109215426467e-06, + "loss": 0.6232, + "step": 7279 + }, + { + "epoch": 2.6874230863893676, + "grad_norm": 0.9195123314857483, + "learning_rate": 2.93263018682795e-06, + "loss": 0.6819, + "step": 7280 + }, + { + "epoch": 2.6877922717204035, + "grad_norm": 0.884553074836731, + "learning_rate": 2.932151141854518e-06, + "loss": 0.6151, + "step": 7281 + }, + { + "epoch": 2.68816145705144, + "grad_norm": 0.8809378147125244, + "learning_rate": 2.9316720805243004e-06, + "loss": 0.6117, + "step": 7282 + }, + { + "epoch": 2.688530642382476, + "grad_norm": 0.8934864401817322, + "learning_rate": 2.931193002855432e-06, + "loss": 0.6085, + "step": 7283 + }, + { + "epoch": 2.688899827713512, + "grad_norm": 0.8935861587524414, + "learning_rate": 2.930713908866044e-06, + "loss": 0.6104, + "step": 7284 + }, + { + "epoch": 2.6892690130445485, + "grad_norm": 0.8507993817329407, + "learning_rate": 2.9302347985742703e-06, + "loss": 0.6059, + "step": 7285 + }, + { + "epoch": 2.6896381983755844, + "grad_norm": 0.8873221278190613, + "learning_rate": 2.929755671998246e-06, + "loss": 0.6285, + "step": 7286 + }, + { + "epoch": 2.6900073837066207, + "grad_norm": 0.9021348357200623, + "learning_rate": 2.9292765291561043e-06, + "loss": 0.6357, + "step": 7287 + }, + { + "epoch": 2.6903765690376567, + "grad_norm": 0.8779234290122986, + "learning_rate": 2.9287973700659815e-06, + "loss": 0.6093, + "step": 7288 + }, + { + "epoch": 2.690745754368693, + "grad_norm": 0.8970140814781189, + "learning_rate": 2.9283181947460142e-06, + "loss": 0.6284, + "step": 7289 + }, + { + "epoch": 2.6911149396997294, + "grad_norm": 0.8575450778007507, + "learning_rate": 2.9278390032143377e-06, + "loss": 0.628, + "step": 7290 + }, + { + "epoch": 2.6914841250307653, + "grad_norm": 0.9026432037353516, + "learning_rate": 2.927359795489091e-06, + "loss": 0.6291, + "step": 7291 + }, + { + "epoch": 2.6918533103618016, + "grad_norm": 0.8984084725379944, + "learning_rate": 2.9268805715884095e-06, + "loss": 0.6136, + "step": 7292 + }, + { + "epoch": 2.692222495692838, + "grad_norm": 0.8982582092285156, + "learning_rate": 2.926401331530435e-06, + "loss": 0.6016, + "step": 7293 + }, + { + "epoch": 2.692591681023874, + "grad_norm": 0.8907396197319031, + "learning_rate": 2.9259220753333027e-06, + "loss": 0.6391, + "step": 7294 + }, + { + "epoch": 2.6929608663549103, + "grad_norm": 0.9214664101600647, + "learning_rate": 2.925442803015155e-06, + "loss": 0.6375, + "step": 7295 + }, + { + "epoch": 2.693330051685946, + "grad_norm": 0.9231593012809753, + "learning_rate": 2.9249635145941313e-06, + "loss": 0.6345, + "step": 7296 + }, + { + "epoch": 2.6936992370169826, + "grad_norm": 0.8889793157577515, + "learning_rate": 2.924484210088373e-06, + "loss": 0.5583, + "step": 7297 + }, + { + "epoch": 2.6940684223480185, + "grad_norm": 0.9189200401306152, + "learning_rate": 2.924004889516021e-06, + "loss": 0.6135, + "step": 7298 + }, + { + "epoch": 2.694437607679055, + "grad_norm": 0.8701145648956299, + "learning_rate": 2.9235255528952185e-06, + "loss": 0.6065, + "step": 7299 + }, + { + "epoch": 2.694806793010091, + "grad_norm": 0.8807947635650635, + "learning_rate": 2.923046200244107e-06, + "loss": 0.6165, + "step": 7300 + }, + { + "epoch": 2.695175978341127, + "grad_norm": 0.8833128809928894, + "learning_rate": 2.9225668315808302e-06, + "loss": 0.6133, + "step": 7301 + }, + { + "epoch": 2.6955451636721635, + "grad_norm": 0.8917711973190308, + "learning_rate": 2.9220874469235322e-06, + "loss": 0.6184, + "step": 7302 + }, + { + "epoch": 2.6959143490032, + "grad_norm": 0.8615773320198059, + "learning_rate": 2.9216080462903577e-06, + "loss": 0.5795, + "step": 7303 + }, + { + "epoch": 2.6962835343342357, + "grad_norm": 0.8615990281105042, + "learning_rate": 2.9211286296994513e-06, + "loss": 0.5964, + "step": 7304 + }, + { + "epoch": 2.696652719665272, + "grad_norm": 0.8947790265083313, + "learning_rate": 2.92064919716896e-06, + "loss": 0.6219, + "step": 7305 + }, + { + "epoch": 2.697021904996308, + "grad_norm": 0.9198167324066162, + "learning_rate": 2.920169748717029e-06, + "loss": 0.6219, + "step": 7306 + }, + { + "epoch": 2.6973910903273444, + "grad_norm": 0.895672082901001, + "learning_rate": 2.919690284361806e-06, + "loss": 0.6453, + "step": 7307 + }, + { + "epoch": 2.6977602756583803, + "grad_norm": 0.9112757444381714, + "learning_rate": 2.919210804121437e-06, + "loss": 0.6407, + "step": 7308 + }, + { + "epoch": 2.6981294609894166, + "grad_norm": 0.8756322860717773, + "learning_rate": 2.9187313080140724e-06, + "loss": 0.6185, + "step": 7309 + }, + { + "epoch": 2.698498646320453, + "grad_norm": 0.9165353178977966, + "learning_rate": 2.9182517960578593e-06, + "loss": 0.6426, + "step": 7310 + }, + { + "epoch": 2.698867831651489, + "grad_norm": 0.8876410126686096, + "learning_rate": 2.9177722682709485e-06, + "loss": 0.6118, + "step": 7311 + }, + { + "epoch": 2.6992370169825253, + "grad_norm": 0.8988224267959595, + "learning_rate": 2.917292724671488e-06, + "loss": 0.6074, + "step": 7312 + }, + { + "epoch": 2.6996062023135616, + "grad_norm": 0.8767006993293762, + "learning_rate": 2.9168131652776303e-06, + "loss": 0.6366, + "step": 7313 + }, + { + "epoch": 2.6999753876445975, + "grad_norm": 0.9112324118614197, + "learning_rate": 2.916333590107526e-06, + "loss": 0.6587, + "step": 7314 + }, + { + "epoch": 2.700344572975634, + "grad_norm": 0.9117581248283386, + "learning_rate": 2.9158539991793266e-06, + "loss": 0.6493, + "step": 7315 + }, + { + "epoch": 2.70071375830667, + "grad_norm": 0.8963902592658997, + "learning_rate": 2.9153743925111843e-06, + "loss": 0.6132, + "step": 7316 + }, + { + "epoch": 2.701082943637706, + "grad_norm": 0.9004045128822327, + "learning_rate": 2.9148947701212533e-06, + "loss": 0.5986, + "step": 7317 + }, + { + "epoch": 2.701452128968742, + "grad_norm": 0.8796491622924805, + "learning_rate": 2.9144151320276855e-06, + "loss": 0.5925, + "step": 7318 + }, + { + "epoch": 2.7018213142997785, + "grad_norm": 0.905889630317688, + "learning_rate": 2.913935478248636e-06, + "loss": 0.6213, + "step": 7319 + }, + { + "epoch": 2.702190499630815, + "grad_norm": 0.860599160194397, + "learning_rate": 2.913455808802258e-06, + "loss": 0.5996, + "step": 7320 + }, + { + "epoch": 2.7025596849618507, + "grad_norm": 0.8875513672828674, + "learning_rate": 2.91297612370671e-06, + "loss": 0.5866, + "step": 7321 + }, + { + "epoch": 2.702928870292887, + "grad_norm": 0.8946561217308044, + "learning_rate": 2.912496422980145e-06, + "loss": 0.6223, + "step": 7322 + }, + { + "epoch": 2.7032980556239234, + "grad_norm": 0.8587249517440796, + "learning_rate": 2.91201670664072e-06, + "loss": 0.599, + "step": 7323 + }, + { + "epoch": 2.7036672409549594, + "grad_norm": 0.878380537033081, + "learning_rate": 2.9115369747065937e-06, + "loss": 0.6031, + "step": 7324 + }, + { + "epoch": 2.7040364262859957, + "grad_norm": 0.8634481430053711, + "learning_rate": 2.9110572271959233e-06, + "loss": 0.6244, + "step": 7325 + }, + { + "epoch": 2.7044056116170316, + "grad_norm": 0.9075191617012024, + "learning_rate": 2.9105774641268653e-06, + "loss": 0.5968, + "step": 7326 + }, + { + "epoch": 2.704774796948068, + "grad_norm": 0.9061803817749023, + "learning_rate": 2.9100976855175806e-06, + "loss": 0.6061, + "step": 7327 + }, + { + "epoch": 2.705143982279104, + "grad_norm": 0.9082587361335754, + "learning_rate": 2.909617891386228e-06, + "loss": 0.6556, + "step": 7328 + }, + { + "epoch": 2.7055131676101403, + "grad_norm": 0.9101486802101135, + "learning_rate": 2.909138081750967e-06, + "loss": 0.6008, + "step": 7329 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.8290569186210632, + "learning_rate": 2.9086582566299587e-06, + "loss": 0.5522, + "step": 7330 + }, + { + "epoch": 2.7062515382722125, + "grad_norm": 0.8803741335868835, + "learning_rate": 2.9081784160413653e-06, + "loss": 0.6045, + "step": 7331 + }, + { + "epoch": 2.706620723603249, + "grad_norm": 0.9025143980979919, + "learning_rate": 2.907698560003346e-06, + "loss": 0.6333, + "step": 7332 + }, + { + "epoch": 2.7069899089342853, + "grad_norm": 0.8912104964256287, + "learning_rate": 2.907218688534066e-06, + "loss": 0.5963, + "step": 7333 + }, + { + "epoch": 2.707359094265321, + "grad_norm": 0.8887360095977783, + "learning_rate": 2.906738801651687e-06, + "loss": 0.5911, + "step": 7334 + }, + { + "epoch": 2.7077282795963575, + "grad_norm": 0.9258095026016235, + "learning_rate": 2.906258899374373e-06, + "loss": 0.6469, + "step": 7335 + }, + { + "epoch": 2.7080974649273934, + "grad_norm": 0.8834803700447083, + "learning_rate": 2.9057789817202877e-06, + "loss": 0.6298, + "step": 7336 + }, + { + "epoch": 2.70846665025843, + "grad_norm": 0.8943313360214233, + "learning_rate": 2.9052990487075955e-06, + "loss": 0.6005, + "step": 7337 + }, + { + "epoch": 2.7088358355894657, + "grad_norm": 0.9252632856369019, + "learning_rate": 2.9048191003544625e-06, + "loss": 0.6436, + "step": 7338 + }, + { + "epoch": 2.709205020920502, + "grad_norm": 0.9020901918411255, + "learning_rate": 2.904339136679055e-06, + "loss": 0.6672, + "step": 7339 + }, + { + "epoch": 2.7095742062515384, + "grad_norm": 0.8270490765571594, + "learning_rate": 2.903859157699538e-06, + "loss": 0.5744, + "step": 7340 + }, + { + "epoch": 2.7099433915825744, + "grad_norm": 0.8775575160980225, + "learning_rate": 2.90337916343408e-06, + "loss": 0.5967, + "step": 7341 + }, + { + "epoch": 2.7103125769136107, + "grad_norm": 0.8910898566246033, + "learning_rate": 2.902899153900847e-06, + "loss": 0.6349, + "step": 7342 + }, + { + "epoch": 2.710681762244647, + "grad_norm": 0.8816712498664856, + "learning_rate": 2.902419129118009e-06, + "loss": 0.5831, + "step": 7343 + }, + { + "epoch": 2.711050947575683, + "grad_norm": 0.9172240495681763, + "learning_rate": 2.9019390891037335e-06, + "loss": 0.6208, + "step": 7344 + }, + { + "epoch": 2.711420132906719, + "grad_norm": 0.8647044897079468, + "learning_rate": 2.9014590338761904e-06, + "loss": 0.6101, + "step": 7345 + }, + { + "epoch": 2.7117893182377553, + "grad_norm": 0.9963017106056213, + "learning_rate": 2.90097896345355e-06, + "loss": 0.5993, + "step": 7346 + }, + { + "epoch": 2.7121585035687916, + "grad_norm": 0.9055141806602478, + "learning_rate": 2.900498877853982e-06, + "loss": 0.6189, + "step": 7347 + }, + { + "epoch": 2.7125276888998275, + "grad_norm": 0.8746891617774963, + "learning_rate": 2.900018777095658e-06, + "loss": 0.6014, + "step": 7348 + }, + { + "epoch": 2.712896874230864, + "grad_norm": 0.8471139073371887, + "learning_rate": 2.8995386611967495e-06, + "loss": 0.5546, + "step": 7349 + }, + { + "epoch": 2.7132660595619003, + "grad_norm": 0.8952683806419373, + "learning_rate": 2.8990585301754286e-06, + "loss": 0.6145, + "step": 7350 + }, + { + "epoch": 2.713635244892936, + "grad_norm": 0.9049074053764343, + "learning_rate": 2.8985783840498682e-06, + "loss": 0.6044, + "step": 7351 + }, + { + "epoch": 2.7140044302239725, + "grad_norm": 0.9252153038978577, + "learning_rate": 2.898098222838243e-06, + "loss": 0.617, + "step": 7352 + }, + { + "epoch": 2.714373615555009, + "grad_norm": 0.8822187185287476, + "learning_rate": 2.8976180465587246e-06, + "loss": 0.6027, + "step": 7353 + }, + { + "epoch": 2.714742800886045, + "grad_norm": 0.904538094997406, + "learning_rate": 2.8971378552294883e-06, + "loss": 0.6566, + "step": 7354 + }, + { + "epoch": 2.7151119862170807, + "grad_norm": 0.9052642583847046, + "learning_rate": 2.8966576488687105e-06, + "loss": 0.6553, + "step": 7355 + }, + { + "epoch": 2.715481171548117, + "grad_norm": 0.874727725982666, + "learning_rate": 2.8961774274945646e-06, + "loss": 0.6074, + "step": 7356 + }, + { + "epoch": 2.7158503568791534, + "grad_norm": 0.8920708894729614, + "learning_rate": 2.895697191125229e-06, + "loss": 0.6059, + "step": 7357 + }, + { + "epoch": 2.7162195422101894, + "grad_norm": 0.8720294237136841, + "learning_rate": 2.8952169397788794e-06, + "loss": 0.6045, + "step": 7358 + }, + { + "epoch": 2.7165887275412257, + "grad_norm": 0.8868765830993652, + "learning_rate": 2.894736673473694e-06, + "loss": 0.6124, + "step": 7359 + }, + { + "epoch": 2.716957912872262, + "grad_norm": 0.8576301336288452, + "learning_rate": 2.8942563922278487e-06, + "loss": 0.5892, + "step": 7360 + }, + { + "epoch": 2.717327098203298, + "grad_norm": 0.8885039687156677, + "learning_rate": 2.8937760960595242e-06, + "loss": 0.6474, + "step": 7361 + }, + { + "epoch": 2.7176962835343343, + "grad_norm": 0.8791964650154114, + "learning_rate": 2.893295784986899e-06, + "loss": 0.5932, + "step": 7362 + }, + { + "epoch": 2.7180654688653703, + "grad_norm": 0.8917666077613831, + "learning_rate": 2.892815459028152e-06, + "loss": 0.603, + "step": 7363 + }, + { + "epoch": 2.7184346541964066, + "grad_norm": 0.8646429777145386, + "learning_rate": 2.892335118201464e-06, + "loss": 0.5781, + "step": 7364 + }, + { + "epoch": 2.7188038395274425, + "grad_norm": 0.9021075963973999, + "learning_rate": 2.891854762525016e-06, + "loss": 0.6507, + "step": 7365 + }, + { + "epoch": 2.719173024858479, + "grad_norm": 0.878610372543335, + "learning_rate": 2.8913743920169883e-06, + "loss": 0.635, + "step": 7366 + }, + { + "epoch": 2.7195422101895153, + "grad_norm": 0.9274274110794067, + "learning_rate": 2.8908940066955638e-06, + "loss": 0.6012, + "step": 7367 + }, + { + "epoch": 2.719911395520551, + "grad_norm": 0.9562507271766663, + "learning_rate": 2.890413606578924e-06, + "loss": 0.6297, + "step": 7368 + }, + { + "epoch": 2.7202805808515875, + "grad_norm": 0.9183669090270996, + "learning_rate": 2.889933191685253e-06, + "loss": 0.6485, + "step": 7369 + }, + { + "epoch": 2.720649766182624, + "grad_norm": 0.8883679509162903, + "learning_rate": 2.8894527620327336e-06, + "loss": 0.5925, + "step": 7370 + }, + { + "epoch": 2.72101895151366, + "grad_norm": 0.8958545327186584, + "learning_rate": 2.8889723176395497e-06, + "loss": 0.5932, + "step": 7371 + }, + { + "epoch": 2.721388136844696, + "grad_norm": 0.8801325559616089, + "learning_rate": 2.8884918585238868e-06, + "loss": 0.6184, + "step": 7372 + }, + { + "epoch": 2.721757322175732, + "grad_norm": 0.8862446546554565, + "learning_rate": 2.888011384703929e-06, + "loss": 0.6258, + "step": 7373 + }, + { + "epoch": 2.7221265075067684, + "grad_norm": 0.9016314148902893, + "learning_rate": 2.8875308961978637e-06, + "loss": 0.6189, + "step": 7374 + }, + { + "epoch": 2.7224956928378043, + "grad_norm": 0.8934279084205627, + "learning_rate": 2.887050393023876e-06, + "loss": 0.6055, + "step": 7375 + }, + { + "epoch": 2.7228648781688407, + "grad_norm": 0.8980122804641724, + "learning_rate": 2.8865698752001527e-06, + "loss": 0.6365, + "step": 7376 + }, + { + "epoch": 2.723234063499877, + "grad_norm": 0.8942857384681702, + "learning_rate": 2.8860893427448815e-06, + "loss": 0.6263, + "step": 7377 + }, + { + "epoch": 2.723603248830913, + "grad_norm": 0.8781708478927612, + "learning_rate": 2.8856087956762506e-06, + "loss": 0.5694, + "step": 7378 + }, + { + "epoch": 2.7239724341619493, + "grad_norm": 0.8834601044654846, + "learning_rate": 2.885128234012449e-06, + "loss": 0.5802, + "step": 7379 + }, + { + "epoch": 2.7243416194929857, + "grad_norm": 0.9179084300994873, + "learning_rate": 2.8846476577716655e-06, + "loss": 0.613, + "step": 7380 + }, + { + "epoch": 2.7247108048240216, + "grad_norm": 0.8791571259498596, + "learning_rate": 2.884167066972089e-06, + "loss": 0.6229, + "step": 7381 + }, + { + "epoch": 2.725079990155058, + "grad_norm": 0.8860880136489868, + "learning_rate": 2.8836864616319106e-06, + "loss": 0.6299, + "step": 7382 + }, + { + "epoch": 2.725449175486094, + "grad_norm": 0.8793582916259766, + "learning_rate": 2.88320584176932e-06, + "loss": 0.6064, + "step": 7383 + }, + { + "epoch": 2.7258183608171302, + "grad_norm": 0.8668330907821655, + "learning_rate": 2.8827252074025096e-06, + "loss": 0.6029, + "step": 7384 + }, + { + "epoch": 2.726187546148166, + "grad_norm": 0.8840840458869934, + "learning_rate": 2.8822445585496704e-06, + "loss": 0.6007, + "step": 7385 + }, + { + "epoch": 2.7265567314792025, + "grad_norm": 0.9056491851806641, + "learning_rate": 2.8817638952289967e-06, + "loss": 0.6361, + "step": 7386 + }, + { + "epoch": 2.726925916810239, + "grad_norm": 0.8725767135620117, + "learning_rate": 2.881283217458679e-06, + "loss": 0.6115, + "step": 7387 + }, + { + "epoch": 2.727295102141275, + "grad_norm": 0.89579838514328, + "learning_rate": 2.880802525256912e-06, + "loss": 0.5849, + "step": 7388 + }, + { + "epoch": 2.727664287472311, + "grad_norm": 0.8796793222427368, + "learning_rate": 2.8803218186418895e-06, + "loss": 0.6162, + "step": 7389 + }, + { + "epoch": 2.7280334728033475, + "grad_norm": 0.8905147314071655, + "learning_rate": 2.879841097631807e-06, + "loss": 0.6287, + "step": 7390 + }, + { + "epoch": 2.7284026581343834, + "grad_norm": 0.9051320552825928, + "learning_rate": 2.879360362244858e-06, + "loss": 0.6053, + "step": 7391 + }, + { + "epoch": 2.72877184346542, + "grad_norm": 0.935868501663208, + "learning_rate": 2.878879612499239e-06, + "loss": 0.6109, + "step": 7392 + }, + { + "epoch": 2.7291410287964557, + "grad_norm": 0.8729000687599182, + "learning_rate": 2.8783988484131466e-06, + "loss": 0.6595, + "step": 7393 + }, + { + "epoch": 2.729510214127492, + "grad_norm": 0.896207332611084, + "learning_rate": 2.877918070004777e-06, + "loss": 0.6101, + "step": 7394 + }, + { + "epoch": 2.729879399458528, + "grad_norm": 0.9036887288093567, + "learning_rate": 2.8774372772923278e-06, + "loss": 0.6393, + "step": 7395 + }, + { + "epoch": 2.7302485847895643, + "grad_norm": 0.8905741572380066, + "learning_rate": 2.8769564702939967e-06, + "loss": 0.6015, + "step": 7396 + }, + { + "epoch": 2.7306177701206007, + "grad_norm": 0.9046378135681152, + "learning_rate": 2.876475649027982e-06, + "loss": 0.6074, + "step": 7397 + }, + { + "epoch": 2.7309869554516366, + "grad_norm": 0.926521360874176, + "learning_rate": 2.875994813512483e-06, + "loss": 0.6145, + "step": 7398 + }, + { + "epoch": 2.731356140782673, + "grad_norm": 0.9077001214027405, + "learning_rate": 2.8755139637656993e-06, + "loss": 0.6239, + "step": 7399 + }, + { + "epoch": 2.7317253261137093, + "grad_norm": 0.9073883295059204, + "learning_rate": 2.8750330998058306e-06, + "loss": 0.6123, + "step": 7400 + }, + { + "epoch": 2.7320945114447452, + "grad_norm": 0.8695136308670044, + "learning_rate": 2.8745522216510767e-06, + "loss": 0.5848, + "step": 7401 + }, + { + "epoch": 2.7324636967757816, + "grad_norm": 0.8569779992103577, + "learning_rate": 2.8740713293196404e-06, + "loss": 0.6082, + "step": 7402 + }, + { + "epoch": 2.7328328821068175, + "grad_norm": 0.9035912752151489, + "learning_rate": 2.873590422829722e-06, + "loss": 0.6254, + "step": 7403 + }, + { + "epoch": 2.733202067437854, + "grad_norm": 0.8851427435874939, + "learning_rate": 2.8731095021995233e-06, + "loss": 0.609, + "step": 7404 + }, + { + "epoch": 2.73357125276889, + "grad_norm": 0.8961101770401001, + "learning_rate": 2.872628567447248e-06, + "loss": 0.6207, + "step": 7405 + }, + { + "epoch": 2.733940438099926, + "grad_norm": 0.9049040675163269, + "learning_rate": 2.8721476185910995e-06, + "loss": 0.6012, + "step": 7406 + }, + { + "epoch": 2.7343096234309625, + "grad_norm": 0.8714366555213928, + "learning_rate": 2.871666655649281e-06, + "loss": 0.6085, + "step": 7407 + }, + { + "epoch": 2.7346788087619984, + "grad_norm": 0.8991366028785706, + "learning_rate": 2.871185678639996e-06, + "loss": 0.6158, + "step": 7408 + }, + { + "epoch": 2.735047994093035, + "grad_norm": 0.8811948895454407, + "learning_rate": 2.870704687581451e-06, + "loss": 0.6081, + "step": 7409 + }, + { + "epoch": 2.735417179424071, + "grad_norm": 0.8820156455039978, + "learning_rate": 2.870223682491851e-06, + "loss": 0.5942, + "step": 7410 + }, + { + "epoch": 2.735786364755107, + "grad_norm": 0.914517879486084, + "learning_rate": 2.869742663389401e-06, + "loss": 0.5899, + "step": 7411 + }, + { + "epoch": 2.736155550086143, + "grad_norm": 0.8935008645057678, + "learning_rate": 2.8692616302923083e-06, + "loss": 0.6156, + "step": 7412 + }, + { + "epoch": 2.7365247354171793, + "grad_norm": 0.8818865418434143, + "learning_rate": 2.8687805832187784e-06, + "loss": 0.6229, + "step": 7413 + }, + { + "epoch": 2.7368939207482157, + "grad_norm": 0.8917115330696106, + "learning_rate": 2.86829952218702e-06, + "loss": 0.6257, + "step": 7414 + }, + { + "epoch": 2.7372631060792516, + "grad_norm": 0.8779562711715698, + "learning_rate": 2.8678184472152417e-06, + "loss": 0.6171, + "step": 7415 + }, + { + "epoch": 2.737632291410288, + "grad_norm": 0.889207661151886, + "learning_rate": 2.8673373583216513e-06, + "loss": 0.6262, + "step": 7416 + }, + { + "epoch": 2.7380014767413243, + "grad_norm": 0.8798189163208008, + "learning_rate": 2.8668562555244572e-06, + "loss": 0.5929, + "step": 7417 + }, + { + "epoch": 2.7383706620723602, + "grad_norm": 0.9012354612350464, + "learning_rate": 2.8663751388418694e-06, + "loss": 0.6067, + "step": 7418 + }, + { + "epoch": 2.7387398474033966, + "grad_norm": 0.8914659023284912, + "learning_rate": 2.8658940082920983e-06, + "loss": 0.6161, + "step": 7419 + }, + { + "epoch": 2.739109032734433, + "grad_norm": 0.8850328922271729, + "learning_rate": 2.8654128638933554e-06, + "loss": 0.6228, + "step": 7420 + }, + { + "epoch": 2.739478218065469, + "grad_norm": 0.9300819635391235, + "learning_rate": 2.86493170566385e-06, + "loss": 0.6591, + "step": 7421 + }, + { + "epoch": 2.739847403396505, + "grad_norm": 0.8815993070602417, + "learning_rate": 2.8644505336217953e-06, + "loss": 0.6357, + "step": 7422 + }, + { + "epoch": 2.740216588727541, + "grad_norm": 0.9077354669570923, + "learning_rate": 2.863969347785402e-06, + "loss": 0.6255, + "step": 7423 + }, + { + "epoch": 2.7405857740585775, + "grad_norm": 0.8906066417694092, + "learning_rate": 2.863488148172885e-06, + "loss": 0.6195, + "step": 7424 + }, + { + "epoch": 2.7409549593896134, + "grad_norm": 0.8915907740592957, + "learning_rate": 2.863006934802455e-06, + "loss": 0.6429, + "step": 7425 + }, + { + "epoch": 2.74132414472065, + "grad_norm": 0.9036286473274231, + "learning_rate": 2.8625257076923286e-06, + "loss": 0.6236, + "step": 7426 + }, + { + "epoch": 2.741693330051686, + "grad_norm": 0.9041144847869873, + "learning_rate": 2.8620444668607172e-06, + "loss": 0.6181, + "step": 7427 + }, + { + "epoch": 2.742062515382722, + "grad_norm": 0.9033268094062805, + "learning_rate": 2.861563212325838e-06, + "loss": 0.6302, + "step": 7428 + }, + { + "epoch": 2.7424317007137584, + "grad_norm": 0.8952858448028564, + "learning_rate": 2.8610819441059047e-06, + "loss": 0.6193, + "step": 7429 + }, + { + "epoch": 2.7428008860447943, + "grad_norm": 0.8621241450309753, + "learning_rate": 2.860600662219134e-06, + "loss": 0.5754, + "step": 7430 + }, + { + "epoch": 2.7431700713758307, + "grad_norm": 0.884795069694519, + "learning_rate": 2.8601193666837425e-06, + "loss": 0.5559, + "step": 7431 + }, + { + "epoch": 2.7435392567068666, + "grad_norm": 0.9023098349571228, + "learning_rate": 2.8596380575179454e-06, + "loss": 0.6202, + "step": 7432 + }, + { + "epoch": 2.743908442037903, + "grad_norm": 0.9047364592552185, + "learning_rate": 2.8591567347399617e-06, + "loss": 0.6456, + "step": 7433 + }, + { + "epoch": 2.7442776273689393, + "grad_norm": 0.8986683487892151, + "learning_rate": 2.8586753983680097e-06, + "loss": 0.6058, + "step": 7434 + }, + { + "epoch": 2.7446468126999752, + "grad_norm": 0.9158204793930054, + "learning_rate": 2.858194048420307e-06, + "loss": 0.6165, + "step": 7435 + }, + { + "epoch": 2.7450159980310116, + "grad_norm": 0.9163082838058472, + "learning_rate": 2.8577126849150726e-06, + "loss": 0.6183, + "step": 7436 + }, + { + "epoch": 2.745385183362048, + "grad_norm": 0.9154778718948364, + "learning_rate": 2.857231307870525e-06, + "loss": 0.625, + "step": 7437 + }, + { + "epoch": 2.745754368693084, + "grad_norm": 0.9411607980728149, + "learning_rate": 2.8567499173048864e-06, + "loss": 0.6091, + "step": 7438 + }, + { + "epoch": 2.7461235540241202, + "grad_norm": 0.8563213348388672, + "learning_rate": 2.856268513236375e-06, + "loss": 0.6262, + "step": 7439 + }, + { + "epoch": 2.746492739355156, + "grad_norm": 0.9226893186569214, + "learning_rate": 2.8557870956832135e-06, + "loss": 0.5725, + "step": 7440 + }, + { + "epoch": 2.7468619246861925, + "grad_norm": 0.9089175462722778, + "learning_rate": 2.855305664663622e-06, + "loss": 0.6486, + "step": 7441 + }, + { + "epoch": 2.7472311100172284, + "grad_norm": 0.8928477168083191, + "learning_rate": 2.854824220195824e-06, + "loss": 0.5969, + "step": 7442 + }, + { + "epoch": 2.7476002953482648, + "grad_norm": 0.9113876819610596, + "learning_rate": 2.8543427622980407e-06, + "loss": 0.6741, + "step": 7443 + }, + { + "epoch": 2.747969480679301, + "grad_norm": 0.8933296799659729, + "learning_rate": 2.8538612909884962e-06, + "loss": 0.6319, + "step": 7444 + }, + { + "epoch": 2.748338666010337, + "grad_norm": 0.92194002866745, + "learning_rate": 2.8533798062854136e-06, + "loss": 0.6373, + "step": 7445 + }, + { + "epoch": 2.7487078513413734, + "grad_norm": 0.8939220905303955, + "learning_rate": 2.852898308207016e-06, + "loss": 0.5974, + "step": 7446 + }, + { + "epoch": 2.7490770366724098, + "grad_norm": 0.8946089148521423, + "learning_rate": 2.852416796771529e-06, + "loss": 0.5899, + "step": 7447 + }, + { + "epoch": 2.7494462220034457, + "grad_norm": 0.924601674079895, + "learning_rate": 2.8519352719971783e-06, + "loss": 0.6252, + "step": 7448 + }, + { + "epoch": 2.749815407334482, + "grad_norm": 0.9017215371131897, + "learning_rate": 2.8514537339021874e-06, + "loss": 0.6089, + "step": 7449 + }, + { + "epoch": 2.750184592665518, + "grad_norm": 0.9330711364746094, + "learning_rate": 2.8509721825047855e-06, + "loss": 0.6128, + "step": 7450 + }, + { + "epoch": 2.7505537779965543, + "grad_norm": 0.8856083154678345, + "learning_rate": 2.850490617823195e-06, + "loss": 0.5541, + "step": 7451 + }, + { + "epoch": 2.7509229633275902, + "grad_norm": 0.8610450625419617, + "learning_rate": 2.8500090398756462e-06, + "loss": 0.6116, + "step": 7452 + }, + { + "epoch": 2.7512921486586266, + "grad_norm": 0.8962690830230713, + "learning_rate": 2.8495274486803657e-06, + "loss": 0.6421, + "step": 7453 + }, + { + "epoch": 2.751661333989663, + "grad_norm": 0.8772549033164978, + "learning_rate": 2.8490458442555825e-06, + "loss": 0.6365, + "step": 7454 + }, + { + "epoch": 2.752030519320699, + "grad_norm": 0.8917569518089294, + "learning_rate": 2.848564226619523e-06, + "loss": 0.6201, + "step": 7455 + }, + { + "epoch": 2.7523997046517352, + "grad_norm": 0.8887903690338135, + "learning_rate": 2.848082595790418e-06, + "loss": 0.6306, + "step": 7456 + }, + { + "epoch": 2.7527688899827716, + "grad_norm": 0.8663029670715332, + "learning_rate": 2.847600951786496e-06, + "loss": 0.6108, + "step": 7457 + }, + { + "epoch": 2.7531380753138075, + "grad_norm": 0.8934242129325867, + "learning_rate": 2.847119294625989e-06, + "loss": 0.5742, + "step": 7458 + }, + { + "epoch": 2.753507260644844, + "grad_norm": 0.8935680389404297, + "learning_rate": 2.8466376243271255e-06, + "loss": 0.6446, + "step": 7459 + }, + { + "epoch": 2.7538764459758798, + "grad_norm": 0.8898221254348755, + "learning_rate": 2.8461559409081373e-06, + "loss": 0.5866, + "step": 7460 + }, + { + "epoch": 2.754245631306916, + "grad_norm": 0.8852329254150391, + "learning_rate": 2.8456742443872553e-06, + "loss": 0.6228, + "step": 7461 + }, + { + "epoch": 2.754614816637952, + "grad_norm": 0.8945639729499817, + "learning_rate": 2.8451925347827136e-06, + "loss": 0.6298, + "step": 7462 + }, + { + "epoch": 2.7549840019689884, + "grad_norm": 0.8945268988609314, + "learning_rate": 2.8447108121127425e-06, + "loss": 0.6179, + "step": 7463 + }, + { + "epoch": 2.7553531873000248, + "grad_norm": 0.9116594791412354, + "learning_rate": 2.8442290763955765e-06, + "loss": 0.6239, + "step": 7464 + }, + { + "epoch": 2.7557223726310607, + "grad_norm": 0.9032543301582336, + "learning_rate": 2.8437473276494475e-06, + "loss": 0.6451, + "step": 7465 + }, + { + "epoch": 2.756091557962097, + "grad_norm": 0.9013664722442627, + "learning_rate": 2.8432655658925918e-06, + "loss": 0.612, + "step": 7466 + }, + { + "epoch": 2.7564607432931334, + "grad_norm": 0.8982788920402527, + "learning_rate": 2.842783791143242e-06, + "loss": 0.6406, + "step": 7467 + }, + { + "epoch": 2.7568299286241693, + "grad_norm": 0.9060657620429993, + "learning_rate": 2.8423020034196354e-06, + "loss": 0.6315, + "step": 7468 + }, + { + "epoch": 2.7571991139552057, + "grad_norm": 0.9017001986503601, + "learning_rate": 2.8418202027400043e-06, + "loss": 0.6231, + "step": 7469 + }, + { + "epoch": 2.7575682992862416, + "grad_norm": 0.8928288817405701, + "learning_rate": 2.841338389122587e-06, + "loss": 0.6137, + "step": 7470 + }, + { + "epoch": 2.757937484617278, + "grad_norm": 0.8735109567642212, + "learning_rate": 2.8408565625856195e-06, + "loss": 0.6141, + "step": 7471 + }, + { + "epoch": 2.758306669948314, + "grad_norm": 0.8970789909362793, + "learning_rate": 2.8403747231473387e-06, + "loss": 0.629, + "step": 7472 + }, + { + "epoch": 2.75867585527935, + "grad_norm": 0.8629143834114075, + "learning_rate": 2.8398928708259815e-06, + "loss": 0.5814, + "step": 7473 + }, + { + "epoch": 2.7590450406103866, + "grad_norm": 0.8607925772666931, + "learning_rate": 2.839411005639787e-06, + "loss": 0.5996, + "step": 7474 + }, + { + "epoch": 2.7594142259414225, + "grad_norm": 0.8910054564476013, + "learning_rate": 2.8389291276069926e-06, + "loss": 0.6117, + "step": 7475 + }, + { + "epoch": 2.759783411272459, + "grad_norm": 0.8824940919876099, + "learning_rate": 2.8384472367458384e-06, + "loss": 0.5981, + "step": 7476 + }, + { + "epoch": 2.760152596603495, + "grad_norm": 0.8526864051818848, + "learning_rate": 2.8379653330745627e-06, + "loss": 0.5868, + "step": 7477 + }, + { + "epoch": 2.760521781934531, + "grad_norm": 0.8981882929801941, + "learning_rate": 2.837483416611406e-06, + "loss": 0.6055, + "step": 7478 + }, + { + "epoch": 2.760890967265567, + "grad_norm": 0.9094981551170349, + "learning_rate": 2.837001487374608e-06, + "loss": 0.6294, + "step": 7479 + }, + { + "epoch": 2.7612601525966034, + "grad_norm": 0.9100338816642761, + "learning_rate": 2.8365195453824103e-06, + "loss": 0.6072, + "step": 7480 + }, + { + "epoch": 2.7616293379276398, + "grad_norm": 0.9084033966064453, + "learning_rate": 2.836037590653054e-06, + "loss": 0.6081, + "step": 7481 + }, + { + "epoch": 2.7619985232586757, + "grad_norm": 0.9082955121994019, + "learning_rate": 2.8355556232047813e-06, + "loss": 0.6136, + "step": 7482 + }, + { + "epoch": 2.762367708589712, + "grad_norm": 0.8651077747344971, + "learning_rate": 2.8350736430558334e-06, + "loss": 0.5943, + "step": 7483 + }, + { + "epoch": 2.7627368939207484, + "grad_norm": 0.8932151198387146, + "learning_rate": 2.8345916502244548e-06, + "loss": 0.5957, + "step": 7484 + }, + { + "epoch": 2.7631060792517843, + "grad_norm": 0.8920661807060242, + "learning_rate": 2.834109644728887e-06, + "loss": 0.6318, + "step": 7485 + }, + { + "epoch": 2.7634752645828207, + "grad_norm": 0.8805489540100098, + "learning_rate": 2.8336276265873753e-06, + "loss": 0.6045, + "step": 7486 + }, + { + "epoch": 2.763844449913857, + "grad_norm": 0.8947156667709351, + "learning_rate": 2.8331455958181626e-06, + "loss": 0.6013, + "step": 7487 + }, + { + "epoch": 2.764213635244893, + "grad_norm": 0.8837153315544128, + "learning_rate": 2.8326635524394952e-06, + "loss": 0.5878, + "step": 7488 + }, + { + "epoch": 2.764582820575929, + "grad_norm": 0.8659005165100098, + "learning_rate": 2.8321814964696158e-06, + "loss": 0.578, + "step": 7489 + }, + { + "epoch": 2.764952005906965, + "grad_norm": 0.856246829032898, + "learning_rate": 2.8316994279267727e-06, + "loss": 0.6205, + "step": 7490 + }, + { + "epoch": 2.7653211912380016, + "grad_norm": 0.9003512263298035, + "learning_rate": 2.831217346829211e-06, + "loss": 0.625, + "step": 7491 + }, + { + "epoch": 2.7656903765690375, + "grad_norm": 0.9107562899589539, + "learning_rate": 2.830735253195177e-06, + "loss": 0.6453, + "step": 7492 + }, + { + "epoch": 2.766059561900074, + "grad_norm": 0.8895819187164307, + "learning_rate": 2.8302531470429177e-06, + "loss": 0.5668, + "step": 7493 + }, + { + "epoch": 2.76642874723111, + "grad_norm": 0.9030516147613525, + "learning_rate": 2.8297710283906816e-06, + "loss": 0.6385, + "step": 7494 + }, + { + "epoch": 2.766797932562146, + "grad_norm": 0.8919291496276855, + "learning_rate": 2.829288897256716e-06, + "loss": 0.6275, + "step": 7495 + }, + { + "epoch": 2.7671671178931825, + "grad_norm": 0.871766984462738, + "learning_rate": 2.8288067536592696e-06, + "loss": 0.6151, + "step": 7496 + }, + { + "epoch": 2.7675363032242184, + "grad_norm": 0.8745489716529846, + "learning_rate": 2.8283245976165907e-06, + "loss": 0.6238, + "step": 7497 + }, + { + "epoch": 2.7679054885552548, + "grad_norm": 0.9064077138900757, + "learning_rate": 2.8278424291469305e-06, + "loss": 0.5978, + "step": 7498 + }, + { + "epoch": 2.7682746738862907, + "grad_norm": 0.875851571559906, + "learning_rate": 2.8273602482685366e-06, + "loss": 0.6283, + "step": 7499 + }, + { + "epoch": 2.768643859217327, + "grad_norm": 0.921059250831604, + "learning_rate": 2.8268780549996617e-06, + "loss": 0.6118, + "step": 7500 + }, + { + "epoch": 2.7690130445483634, + "grad_norm": 0.8867506980895996, + "learning_rate": 2.826395849358555e-06, + "loss": 0.604, + "step": 7501 + }, + { + "epoch": 2.7693822298793993, + "grad_norm": 0.8739767670631409, + "learning_rate": 2.825913631363468e-06, + "loss": 0.6211, + "step": 7502 + }, + { + "epoch": 2.7697514152104357, + "grad_norm": 0.9111621975898743, + "learning_rate": 2.8254314010326535e-06, + "loss": 0.5917, + "step": 7503 + }, + { + "epoch": 2.770120600541472, + "grad_norm": 0.9103347063064575, + "learning_rate": 2.8249491583843637e-06, + "loss": 0.6298, + "step": 7504 + }, + { + "epoch": 2.770489785872508, + "grad_norm": 0.972912073135376, + "learning_rate": 2.8244669034368498e-06, + "loss": 0.5904, + "step": 7505 + }, + { + "epoch": 2.7708589712035443, + "grad_norm": 0.8796701431274414, + "learning_rate": 2.823984636208367e-06, + "loss": 0.6349, + "step": 7506 + }, + { + "epoch": 2.77122815653458, + "grad_norm": 0.8795666098594666, + "learning_rate": 2.823502356717167e-06, + "loss": 0.5695, + "step": 7507 + }, + { + "epoch": 2.7715973418656166, + "grad_norm": 0.9082992076873779, + "learning_rate": 2.823020064981505e-06, + "loss": 0.6187, + "step": 7508 + }, + { + "epoch": 2.7719665271966525, + "grad_norm": 0.9167087078094482, + "learning_rate": 2.8225377610196357e-06, + "loss": 0.6277, + "step": 7509 + }, + { + "epoch": 2.772335712527689, + "grad_norm": 0.8877707719802856, + "learning_rate": 2.8220554448498144e-06, + "loss": 0.6291, + "step": 7510 + }, + { + "epoch": 2.772704897858725, + "grad_norm": 0.875231921672821, + "learning_rate": 2.821573116490296e-06, + "loss": 0.5661, + "step": 7511 + }, + { + "epoch": 2.773074083189761, + "grad_norm": 0.8936153054237366, + "learning_rate": 2.8210907759593365e-06, + "loss": 0.6479, + "step": 7512 + }, + { + "epoch": 2.7734432685207975, + "grad_norm": 0.9063799381256104, + "learning_rate": 2.820608423275192e-06, + "loss": 0.5978, + "step": 7513 + }, + { + "epoch": 2.773812453851834, + "grad_norm": 0.8832616209983826, + "learning_rate": 2.8201260584561202e-06, + "loss": 0.6362, + "step": 7514 + }, + { + "epoch": 2.7741816391828698, + "grad_norm": 0.9048763513565063, + "learning_rate": 2.819643681520378e-06, + "loss": 0.6331, + "step": 7515 + }, + { + "epoch": 2.774550824513906, + "grad_norm": 0.9219979643821716, + "learning_rate": 2.8191612924862243e-06, + "loss": 0.596, + "step": 7516 + }, + { + "epoch": 2.774920009844942, + "grad_norm": 0.9133176803588867, + "learning_rate": 2.818678891371915e-06, + "loss": 0.605, + "step": 7517 + }, + { + "epoch": 2.7752891951759784, + "grad_norm": 0.8839098811149597, + "learning_rate": 2.818196478195711e-06, + "loss": 0.6134, + "step": 7518 + }, + { + "epoch": 2.7756583805070143, + "grad_norm": 0.8648233413696289, + "learning_rate": 2.8177140529758708e-06, + "loss": 0.5798, + "step": 7519 + }, + { + "epoch": 2.7760275658380507, + "grad_norm": 0.8981606364250183, + "learning_rate": 2.8172316157306535e-06, + "loss": 0.5764, + "step": 7520 + }, + { + "epoch": 2.776396751169087, + "grad_norm": 0.8905041813850403, + "learning_rate": 2.81674916647832e-06, + "loss": 0.6047, + "step": 7521 + }, + { + "epoch": 2.776765936500123, + "grad_norm": 0.9254972338676453, + "learning_rate": 2.81626670523713e-06, + "loss": 0.6193, + "step": 7522 + }, + { + "epoch": 2.7771351218311593, + "grad_norm": 0.8985603451728821, + "learning_rate": 2.8157842320253455e-06, + "loss": 0.63, + "step": 7523 + }, + { + "epoch": 2.7775043071621957, + "grad_norm": 1.2203404903411865, + "learning_rate": 2.8153017468612277e-06, + "loss": 0.6012, + "step": 7524 + }, + { + "epoch": 2.7778734924932316, + "grad_norm": 0.8908824920654297, + "learning_rate": 2.8148192497630377e-06, + "loss": 0.6174, + "step": 7525 + }, + { + "epoch": 2.778242677824268, + "grad_norm": 0.8949019312858582, + "learning_rate": 2.8143367407490395e-06, + "loss": 0.6426, + "step": 7526 + }, + { + "epoch": 2.778611863155304, + "grad_norm": 0.9184598922729492, + "learning_rate": 2.8138542198374936e-06, + "loss": 0.6295, + "step": 7527 + }, + { + "epoch": 2.77898104848634, + "grad_norm": 0.8830899000167847, + "learning_rate": 2.8133716870466655e-06, + "loss": 0.6315, + "step": 7528 + }, + { + "epoch": 2.779350233817376, + "grad_norm": 0.8990486264228821, + "learning_rate": 2.8128891423948173e-06, + "loss": 0.5715, + "step": 7529 + }, + { + "epoch": 2.7797194191484125, + "grad_norm": 0.9081230759620667, + "learning_rate": 2.8124065859002144e-06, + "loss": 0.6187, + "step": 7530 + }, + { + "epoch": 2.780088604479449, + "grad_norm": 0.872272789478302, + "learning_rate": 2.8119240175811204e-06, + "loss": 0.5941, + "step": 7531 + }, + { + "epoch": 2.7804577898104847, + "grad_norm": 0.8677495718002319, + "learning_rate": 2.8114414374558e-06, + "loss": 0.5661, + "step": 7532 + }, + { + "epoch": 2.780826975141521, + "grad_norm": 0.9113672971725464, + "learning_rate": 2.8109588455425207e-06, + "loss": 0.6173, + "step": 7533 + }, + { + "epoch": 2.7811961604725575, + "grad_norm": 0.8956503868103027, + "learning_rate": 2.810476241859547e-06, + "loss": 0.6475, + "step": 7534 + }, + { + "epoch": 2.7815653458035934, + "grad_norm": 0.9333337545394897, + "learning_rate": 2.8099936264251455e-06, + "loss": 0.6213, + "step": 7535 + }, + { + "epoch": 2.7819345311346297, + "grad_norm": 0.916607677936554, + "learning_rate": 2.8095109992575824e-06, + "loss": 0.6204, + "step": 7536 + }, + { + "epoch": 2.7823037164656657, + "grad_norm": 0.8918951153755188, + "learning_rate": 2.809028360375126e-06, + "loss": 0.6039, + "step": 7537 + }, + { + "epoch": 2.782672901796702, + "grad_norm": 0.8847224116325378, + "learning_rate": 2.8085457097960445e-06, + "loss": 0.6058, + "step": 7538 + }, + { + "epoch": 2.783042087127738, + "grad_norm": 0.9759724140167236, + "learning_rate": 2.8080630475386043e-06, + "loss": 0.6144, + "step": 7539 + }, + { + "epoch": 2.7834112724587743, + "grad_norm": 0.8639406561851501, + "learning_rate": 2.807580373621076e-06, + "loss": 0.6058, + "step": 7540 + }, + { + "epoch": 2.7837804577898106, + "grad_norm": 0.8800864219665527, + "learning_rate": 2.807097688061726e-06, + "loss": 0.6307, + "step": 7541 + }, + { + "epoch": 2.7841496431208466, + "grad_norm": 0.9377948045730591, + "learning_rate": 2.806614990878827e-06, + "loss": 0.6262, + "step": 7542 + }, + { + "epoch": 2.784518828451883, + "grad_norm": 0.9103282690048218, + "learning_rate": 2.8061322820906466e-06, + "loss": 0.6153, + "step": 7543 + }, + { + "epoch": 2.7848880137829193, + "grad_norm": 0.8861456513404846, + "learning_rate": 2.8056495617154565e-06, + "loss": 0.6227, + "step": 7544 + }, + { + "epoch": 2.785257199113955, + "grad_norm": 0.8824044466018677, + "learning_rate": 2.805166829771527e-06, + "loss": 0.6127, + "step": 7545 + }, + { + "epoch": 2.785626384444991, + "grad_norm": 0.9306734204292297, + "learning_rate": 2.8046840862771285e-06, + "loss": 0.6179, + "step": 7546 + }, + { + "epoch": 2.7859955697760275, + "grad_norm": 0.8760329484939575, + "learning_rate": 2.8042013312505333e-06, + "loss": 0.5878, + "step": 7547 + }, + { + "epoch": 2.786364755107064, + "grad_norm": 0.8945958614349365, + "learning_rate": 2.803718564710015e-06, + "loss": 0.6396, + "step": 7548 + }, + { + "epoch": 2.7867339404380997, + "grad_norm": 0.9029567241668701, + "learning_rate": 2.803235786673844e-06, + "loss": 0.5935, + "step": 7549 + }, + { + "epoch": 2.787103125769136, + "grad_norm": 0.9238128066062927, + "learning_rate": 2.8027529971602948e-06, + "loss": 0.6128, + "step": 7550 + }, + { + "epoch": 2.7874723111001725, + "grad_norm": 0.9104467034339905, + "learning_rate": 2.802270196187639e-06, + "loss": 0.6369, + "step": 7551 + }, + { + "epoch": 2.7878414964312084, + "grad_norm": 0.8705983757972717, + "learning_rate": 2.8017873837741526e-06, + "loss": 0.5548, + "step": 7552 + }, + { + "epoch": 2.7882106817622447, + "grad_norm": 0.9524829387664795, + "learning_rate": 2.8013045599381088e-06, + "loss": 0.6076, + "step": 7553 + }, + { + "epoch": 2.788579867093281, + "grad_norm": 0.8769269585609436, + "learning_rate": 2.800821724697783e-06, + "loss": 0.6197, + "step": 7554 + }, + { + "epoch": 2.788949052424317, + "grad_norm": 0.9126443862915039, + "learning_rate": 2.8003388780714484e-06, + "loss": 0.6077, + "step": 7555 + }, + { + "epoch": 2.789318237755353, + "grad_norm": 0.8970649838447571, + "learning_rate": 2.799856020077384e-06, + "loss": 0.6108, + "step": 7556 + }, + { + "epoch": 2.7896874230863893, + "grad_norm": 0.9141059517860413, + "learning_rate": 2.799373150733862e-06, + "loss": 0.6298, + "step": 7557 + }, + { + "epoch": 2.7900566084174256, + "grad_norm": 0.8975700736045837, + "learning_rate": 2.7988902700591624e-06, + "loss": 0.5953, + "step": 7558 + }, + { + "epoch": 2.7904257937484616, + "grad_norm": 0.8784664869308472, + "learning_rate": 2.798407378071559e-06, + "loss": 0.6264, + "step": 7559 + }, + { + "epoch": 2.790794979079498, + "grad_norm": 0.9274550080299377, + "learning_rate": 2.7979244747893313e-06, + "loss": 0.6426, + "step": 7560 + }, + { + "epoch": 2.7911641644105343, + "grad_norm": 0.920070230960846, + "learning_rate": 2.797441560230756e-06, + "loss": 0.6312, + "step": 7561 + }, + { + "epoch": 2.79153334974157, + "grad_norm": 0.8863987326622009, + "learning_rate": 2.796958634414112e-06, + "loss": 0.5981, + "step": 7562 + }, + { + "epoch": 2.7919025350726066, + "grad_norm": 0.8732567429542542, + "learning_rate": 2.7964756973576767e-06, + "loss": 0.624, + "step": 7563 + }, + { + "epoch": 2.7922717204036425, + "grad_norm": 0.9279804825782776, + "learning_rate": 2.7959927490797306e-06, + "loss": 0.6444, + "step": 7564 + }, + { + "epoch": 2.792640905734679, + "grad_norm": 0.9284129738807678, + "learning_rate": 2.7955097895985516e-06, + "loss": 0.6157, + "step": 7565 + }, + { + "epoch": 2.7930100910657147, + "grad_norm": 0.8969399333000183, + "learning_rate": 2.795026818932421e-06, + "loss": 0.6025, + "step": 7566 + }, + { + "epoch": 2.793379276396751, + "grad_norm": 0.8756632804870605, + "learning_rate": 2.794543837099618e-06, + "loss": 0.5944, + "step": 7567 + }, + { + "epoch": 2.7937484617277875, + "grad_norm": 0.9346538186073303, + "learning_rate": 2.794060844118424e-06, + "loss": 0.624, + "step": 7568 + }, + { + "epoch": 2.7941176470588234, + "grad_norm": 0.9083490967750549, + "learning_rate": 2.7935778400071196e-06, + "loss": 0.6417, + "step": 7569 + }, + { + "epoch": 2.7944868323898597, + "grad_norm": 0.8760696053504944, + "learning_rate": 2.793094824783986e-06, + "loss": 0.613, + "step": 7570 + }, + { + "epoch": 2.794856017720896, + "grad_norm": 0.8864263892173767, + "learning_rate": 2.7926117984673068e-06, + "loss": 0.6025, + "step": 7571 + }, + { + "epoch": 2.795225203051932, + "grad_norm": 0.8889302015304565, + "learning_rate": 2.7921287610753634e-06, + "loss": 0.6675, + "step": 7572 + }, + { + "epoch": 2.7955943883829684, + "grad_norm": 0.8919947147369385, + "learning_rate": 2.791645712626438e-06, + "loss": 0.6, + "step": 7573 + }, + { + "epoch": 2.7959635737140043, + "grad_norm": 0.8859171271324158, + "learning_rate": 2.7911626531388143e-06, + "loss": 0.6188, + "step": 7574 + }, + { + "epoch": 2.7963327590450406, + "grad_norm": 0.9192028045654297, + "learning_rate": 2.7906795826307766e-06, + "loss": 0.6522, + "step": 7575 + }, + { + "epoch": 2.7967019443760766, + "grad_norm": 0.8798398971557617, + "learning_rate": 2.7901965011206085e-06, + "loss": 0.5954, + "step": 7576 + }, + { + "epoch": 2.797071129707113, + "grad_norm": 0.907598078250885, + "learning_rate": 2.7897134086265936e-06, + "loss": 0.6246, + "step": 7577 + }, + { + "epoch": 2.7974403150381493, + "grad_norm": 0.8823449015617371, + "learning_rate": 2.789230305167019e-06, + "loss": 0.5864, + "step": 7578 + }, + { + "epoch": 2.797809500369185, + "grad_norm": 1.021831750869751, + "learning_rate": 2.788747190760167e-06, + "loss": 0.6048, + "step": 7579 + }, + { + "epoch": 2.7981786857002215, + "grad_norm": 0.8788116574287415, + "learning_rate": 2.788264065424326e-06, + "loss": 0.6214, + "step": 7580 + }, + { + "epoch": 2.798547871031258, + "grad_norm": 0.8971526026725769, + "learning_rate": 2.7877809291777808e-06, + "loss": 0.5889, + "step": 7581 + }, + { + "epoch": 2.798917056362294, + "grad_norm": 0.86673903465271, + "learning_rate": 2.7872977820388184e-06, + "loss": 0.602, + "step": 7582 + }, + { + "epoch": 2.79928624169333, + "grad_norm": 0.8913819789886475, + "learning_rate": 2.7868146240257255e-06, + "loss": 0.6113, + "step": 7583 + }, + { + "epoch": 2.799655427024366, + "grad_norm": 0.8979589939117432, + "learning_rate": 2.78633145515679e-06, + "loss": 0.5926, + "step": 7584 + }, + { + "epoch": 2.8000246123554025, + "grad_norm": 0.8841614127159119, + "learning_rate": 2.7858482754502988e-06, + "loss": 0.637, + "step": 7585 + }, + { + "epoch": 2.8003937976864384, + "grad_norm": 0.8989567756652832, + "learning_rate": 2.7853650849245407e-06, + "loss": 0.6047, + "step": 7586 + }, + { + "epoch": 2.8007629830174747, + "grad_norm": 0.8650858998298645, + "learning_rate": 2.7848818835978043e-06, + "loss": 0.5722, + "step": 7587 + }, + { + "epoch": 2.801132168348511, + "grad_norm": 0.8624334931373596, + "learning_rate": 2.7843986714883788e-06, + "loss": 0.5845, + "step": 7588 + }, + { + "epoch": 2.801501353679547, + "grad_norm": 0.8531935811042786, + "learning_rate": 2.783915448614552e-06, + "loss": 0.5822, + "step": 7589 + }, + { + "epoch": 2.8018705390105834, + "grad_norm": 0.9423304796218872, + "learning_rate": 2.7834322149946162e-06, + "loss": 0.6385, + "step": 7590 + }, + { + "epoch": 2.8022397243416197, + "grad_norm": 0.8880323767662048, + "learning_rate": 2.7829489706468605e-06, + "loss": 0.6279, + "step": 7591 + }, + { + "epoch": 2.8026089096726556, + "grad_norm": 0.883964478969574, + "learning_rate": 2.7824657155895763e-06, + "loss": 0.5968, + "step": 7592 + }, + { + "epoch": 2.802978095003692, + "grad_norm": 0.9037498235702515, + "learning_rate": 2.7819824498410524e-06, + "loss": 0.6194, + "step": 7593 + }, + { + "epoch": 2.803347280334728, + "grad_norm": 0.8742309212684631, + "learning_rate": 2.7814991734195823e-06, + "loss": 0.6247, + "step": 7594 + }, + { + "epoch": 2.8037164656657643, + "grad_norm": 0.9942402243614197, + "learning_rate": 2.7810158863434574e-06, + "loss": 0.5997, + "step": 7595 + }, + { + "epoch": 2.8040856509968, + "grad_norm": 0.8567987084388733, + "learning_rate": 2.7805325886309705e-06, + "loss": 0.6014, + "step": 7596 + }, + { + "epoch": 2.8044548363278365, + "grad_norm": 0.8575032949447632, + "learning_rate": 2.7800492803004126e-06, + "loss": 0.6231, + "step": 7597 + }, + { + "epoch": 2.804824021658873, + "grad_norm": 0.9174161553382874, + "learning_rate": 2.779565961370078e-06, + "loss": 0.6238, + "step": 7598 + }, + { + "epoch": 2.805193206989909, + "grad_norm": 0.8928158283233643, + "learning_rate": 2.7790826318582604e-06, + "loss": 0.659, + "step": 7599 + }, + { + "epoch": 2.805562392320945, + "grad_norm": 0.9138851165771484, + "learning_rate": 2.778599291783254e-06, + "loss": 0.6119, + "step": 7600 + }, + { + "epoch": 2.8059315776519815, + "grad_norm": 0.8996960520744324, + "learning_rate": 2.7781159411633506e-06, + "loss": 0.6223, + "step": 7601 + }, + { + "epoch": 2.8063007629830174, + "grad_norm": 0.8720252513885498, + "learning_rate": 2.7776325800168487e-06, + "loss": 0.5906, + "step": 7602 + }, + { + "epoch": 2.806669948314054, + "grad_norm": 0.8953214287757874, + "learning_rate": 2.7771492083620395e-06, + "loss": 0.6273, + "step": 7603 + }, + { + "epoch": 2.8070391336450897, + "grad_norm": 0.9185512065887451, + "learning_rate": 2.7766658262172207e-06, + "loss": 0.6051, + "step": 7604 + }, + { + "epoch": 2.807408318976126, + "grad_norm": 0.9350767731666565, + "learning_rate": 2.7761824336006877e-06, + "loss": 0.5956, + "step": 7605 + }, + { + "epoch": 2.807777504307162, + "grad_norm": 0.918003261089325, + "learning_rate": 2.7756990305307373e-06, + "loss": 0.6212, + "step": 7606 + }, + { + "epoch": 2.8081466896381984, + "grad_norm": 0.8749215006828308, + "learning_rate": 2.775215617025665e-06, + "loss": 0.6022, + "step": 7607 + }, + { + "epoch": 2.8085158749692347, + "grad_norm": 0.8879005312919617, + "learning_rate": 2.7747321931037684e-06, + "loss": 0.596, + "step": 7608 + }, + { + "epoch": 2.8088850603002706, + "grad_norm": 0.8707795739173889, + "learning_rate": 2.774248758783345e-06, + "loss": 0.6129, + "step": 7609 + }, + { + "epoch": 2.809254245631307, + "grad_norm": 0.8886193633079529, + "learning_rate": 2.773765314082693e-06, + "loss": 0.6225, + "step": 7610 + }, + { + "epoch": 2.8096234309623433, + "grad_norm": 0.8934370875358582, + "learning_rate": 2.7732818590201104e-06, + "loss": 0.6271, + "step": 7611 + }, + { + "epoch": 2.8099926162933793, + "grad_norm": 0.9063937067985535, + "learning_rate": 2.772798393613895e-06, + "loss": 0.6247, + "step": 7612 + }, + { + "epoch": 2.8103618016244156, + "grad_norm": 0.9182116389274597, + "learning_rate": 2.7723149178823474e-06, + "loss": 0.6135, + "step": 7613 + }, + { + "epoch": 2.8107309869554515, + "grad_norm": 0.8908085823059082, + "learning_rate": 2.7718314318437656e-06, + "loss": 0.6001, + "step": 7614 + }, + { + "epoch": 2.811100172286488, + "grad_norm": 0.8718469142913818, + "learning_rate": 2.7713479355164495e-06, + "loss": 0.609, + "step": 7615 + }, + { + "epoch": 2.811469357617524, + "grad_norm": 0.8989517688751221, + "learning_rate": 2.7708644289187e-06, + "loss": 0.6004, + "step": 7616 + }, + { + "epoch": 2.81183854294856, + "grad_norm": 0.8900275826454163, + "learning_rate": 2.770380912068818e-06, + "loss": 0.6292, + "step": 7617 + }, + { + "epoch": 2.8122077282795965, + "grad_norm": 0.8962679505348206, + "learning_rate": 2.769897384985103e-06, + "loss": 0.6244, + "step": 7618 + }, + { + "epoch": 2.8125769136106324, + "grad_norm": 0.9222976565361023, + "learning_rate": 2.7694138476858575e-06, + "loss": 0.6256, + "step": 7619 + }, + { + "epoch": 2.812946098941669, + "grad_norm": 0.9091322422027588, + "learning_rate": 2.768930300189383e-06, + "loss": 0.6126, + "step": 7620 + }, + { + "epoch": 2.813315284272705, + "grad_norm": 0.908890426158905, + "learning_rate": 2.768446742513981e-06, + "loss": 0.6353, + "step": 7621 + }, + { + "epoch": 2.813684469603741, + "grad_norm": 0.9071202874183655, + "learning_rate": 2.7679631746779546e-06, + "loss": 0.59, + "step": 7622 + }, + { + "epoch": 2.814053654934777, + "grad_norm": 0.9203635454177856, + "learning_rate": 2.767479596699607e-06, + "loss": 0.6384, + "step": 7623 + }, + { + "epoch": 2.8144228402658134, + "grad_norm": 0.8855568170547485, + "learning_rate": 2.7669960085972407e-06, + "loss": 0.6281, + "step": 7624 + }, + { + "epoch": 2.8147920255968497, + "grad_norm": 0.9192225933074951, + "learning_rate": 2.76651241038916e-06, + "loss": 0.6204, + "step": 7625 + }, + { + "epoch": 2.8151612109278856, + "grad_norm": 0.9035342931747437, + "learning_rate": 2.7660288020936695e-06, + "loss": 0.6073, + "step": 7626 + }, + { + "epoch": 2.815530396258922, + "grad_norm": 0.9116398096084595, + "learning_rate": 2.7655451837290715e-06, + "loss": 0.6313, + "step": 7627 + }, + { + "epoch": 2.8158995815899583, + "grad_norm": 0.8919509053230286, + "learning_rate": 2.7650615553136724e-06, + "loss": 0.6181, + "step": 7628 + }, + { + "epoch": 2.8162687669209943, + "grad_norm": 0.9002915024757385, + "learning_rate": 2.764577916865777e-06, + "loss": 0.6001, + "step": 7629 + }, + { + "epoch": 2.8166379522520306, + "grad_norm": 0.8632021546363831, + "learning_rate": 2.7640942684036916e-06, + "loss": 0.5979, + "step": 7630 + }, + { + "epoch": 2.8170071375830665, + "grad_norm": 0.8881890773773193, + "learning_rate": 2.763610609945721e-06, + "loss": 0.6314, + "step": 7631 + }, + { + "epoch": 2.817376322914103, + "grad_norm": 0.8964690566062927, + "learning_rate": 2.763126941510172e-06, + "loss": 0.6003, + "step": 7632 + }, + { + "epoch": 2.817745508245139, + "grad_norm": 0.8653854727745056, + "learning_rate": 2.7626432631153514e-06, + "loss": 0.5758, + "step": 7633 + }, + { + "epoch": 2.818114693576175, + "grad_norm": 0.8883273005485535, + "learning_rate": 2.762159574779566e-06, + "loss": 0.5996, + "step": 7634 + }, + { + "epoch": 2.8184838789072115, + "grad_norm": 0.9320270419120789, + "learning_rate": 2.761675876521124e-06, + "loss": 0.6324, + "step": 7635 + }, + { + "epoch": 2.8188530642382474, + "grad_norm": 0.8762413859367371, + "learning_rate": 2.7611921683583317e-06, + "loss": 0.5742, + "step": 7636 + }, + { + "epoch": 2.819222249569284, + "grad_norm": 0.8855172395706177, + "learning_rate": 2.7607084503094995e-06, + "loss": 0.5754, + "step": 7637 + }, + { + "epoch": 2.81959143490032, + "grad_norm": 0.8595521450042725, + "learning_rate": 2.7602247223929335e-06, + "loss": 0.5768, + "step": 7638 + }, + { + "epoch": 2.819960620231356, + "grad_norm": 0.9262213110923767, + "learning_rate": 2.7597409846269446e-06, + "loss": 0.6333, + "step": 7639 + }, + { + "epoch": 2.8203298055623924, + "grad_norm": 0.9470617771148682, + "learning_rate": 2.7592572370298416e-06, + "loss": 0.6156, + "step": 7640 + }, + { + "epoch": 2.8206989908934283, + "grad_norm": 0.9146953821182251, + "learning_rate": 2.7587734796199335e-06, + "loss": 0.5793, + "step": 7641 + }, + { + "epoch": 2.8210681762244647, + "grad_norm": 0.9416590332984924, + "learning_rate": 2.7582897124155305e-06, + "loss": 0.6622, + "step": 7642 + }, + { + "epoch": 2.8214373615555006, + "grad_norm": 0.8838027715682983, + "learning_rate": 2.757805935434944e-06, + "loss": 0.6168, + "step": 7643 + }, + { + "epoch": 2.821806546886537, + "grad_norm": 0.896210253238678, + "learning_rate": 2.7573221486964844e-06, + "loss": 0.5957, + "step": 7644 + }, + { + "epoch": 2.8221757322175733, + "grad_norm": 0.8879446387290955, + "learning_rate": 2.7568383522184623e-06, + "loss": 0.6383, + "step": 7645 + }, + { + "epoch": 2.8225449175486093, + "grad_norm": 3.122713804244995, + "learning_rate": 2.75635454601919e-06, + "loss": 0.6213, + "step": 7646 + }, + { + "epoch": 2.8229141028796456, + "grad_norm": 0.8824042677879333, + "learning_rate": 2.755870730116979e-06, + "loss": 0.5911, + "step": 7647 + }, + { + "epoch": 2.823283288210682, + "grad_norm": 0.8882304430007935, + "learning_rate": 2.7553869045301412e-06, + "loss": 0.6287, + "step": 7648 + }, + { + "epoch": 2.823652473541718, + "grad_norm": 0.909805417060852, + "learning_rate": 2.7549030692769896e-06, + "loss": 0.5715, + "step": 7649 + }, + { + "epoch": 2.8240216588727542, + "grad_norm": 0.8620378971099854, + "learning_rate": 2.754419224375837e-06, + "loss": 0.592, + "step": 7650 + }, + { + "epoch": 2.82439084420379, + "grad_norm": 0.9115160703659058, + "learning_rate": 2.7539353698449983e-06, + "loss": 0.6275, + "step": 7651 + }, + { + "epoch": 2.8247600295348265, + "grad_norm": 0.8840954303741455, + "learning_rate": 2.753451505702785e-06, + "loss": 0.6191, + "step": 7652 + }, + { + "epoch": 2.8251292148658624, + "grad_norm": 0.8774763941764832, + "learning_rate": 2.7529676319675125e-06, + "loss": 0.6243, + "step": 7653 + }, + { + "epoch": 2.825498400196899, + "grad_norm": 0.9129458069801331, + "learning_rate": 2.752483748657495e-06, + "loss": 0.6098, + "step": 7654 + }, + { + "epoch": 2.825867585527935, + "grad_norm": 0.8701092004776001, + "learning_rate": 2.7519998557910475e-06, + "loss": 0.6222, + "step": 7655 + }, + { + "epoch": 2.826236770858971, + "grad_norm": 0.9313108921051025, + "learning_rate": 2.751515953386484e-06, + "loss": 0.6167, + "step": 7656 + }, + { + "epoch": 2.8266059561900074, + "grad_norm": 0.9017849564552307, + "learning_rate": 2.7510320414621226e-06, + "loss": 0.5879, + "step": 7657 + }, + { + "epoch": 2.826975141521044, + "grad_norm": 0.8806102871894836, + "learning_rate": 2.7505481200362765e-06, + "loss": 0.6027, + "step": 7658 + }, + { + "epoch": 2.8273443268520797, + "grad_norm": 0.8647875785827637, + "learning_rate": 2.750064189127264e-06, + "loss": 0.6067, + "step": 7659 + }, + { + "epoch": 2.827713512183116, + "grad_norm": 0.882455050945282, + "learning_rate": 2.7495802487533996e-06, + "loss": 0.5816, + "step": 7660 + }, + { + "epoch": 2.828082697514152, + "grad_norm": 0.8713465332984924, + "learning_rate": 2.749096298933003e-06, + "loss": 0.612, + "step": 7661 + }, + { + "epoch": 2.8284518828451883, + "grad_norm": 0.9121603965759277, + "learning_rate": 2.748612339684389e-06, + "loss": 0.6115, + "step": 7662 + }, + { + "epoch": 2.8288210681762243, + "grad_norm": 0.8633906841278076, + "learning_rate": 2.7481283710258765e-06, + "loss": 0.6015, + "step": 7663 + }, + { + "epoch": 2.8291902535072606, + "grad_norm": 0.9112171530723572, + "learning_rate": 2.7476443929757848e-06, + "loss": 0.5788, + "step": 7664 + }, + { + "epoch": 2.829559438838297, + "grad_norm": 0.8784250617027283, + "learning_rate": 2.7471604055524294e-06, + "loss": 0.6329, + "step": 7665 + }, + { + "epoch": 2.829928624169333, + "grad_norm": 0.9257387518882751, + "learning_rate": 2.746676408774131e-06, + "loss": 0.617, + "step": 7666 + }, + { + "epoch": 2.8302978095003692, + "grad_norm": 0.8989806175231934, + "learning_rate": 2.746192402659209e-06, + "loss": 0.5927, + "step": 7667 + }, + { + "epoch": 2.8306669948314056, + "grad_norm": 0.92884761095047, + "learning_rate": 2.7457083872259815e-06, + "loss": 0.6009, + "step": 7668 + }, + { + "epoch": 2.8310361801624415, + "grad_norm": 0.9079515337944031, + "learning_rate": 2.7452243624927695e-06, + "loss": 0.6313, + "step": 7669 + }, + { + "epoch": 2.831405365493478, + "grad_norm": 0.9275363683700562, + "learning_rate": 2.744740328477893e-06, + "loss": 0.6412, + "step": 7670 + }, + { + "epoch": 2.831774550824514, + "grad_norm": 0.9020093083381653, + "learning_rate": 2.7442562851996718e-06, + "loss": 0.5954, + "step": 7671 + }, + { + "epoch": 2.83214373615555, + "grad_norm": 0.9329630136489868, + "learning_rate": 2.7437722326764278e-06, + "loss": 0.6488, + "step": 7672 + }, + { + "epoch": 2.832512921486586, + "grad_norm": 0.8908448219299316, + "learning_rate": 2.7432881709264814e-06, + "loss": 0.578, + "step": 7673 + }, + { + "epoch": 2.8328821068176224, + "grad_norm": 0.8602346181869507, + "learning_rate": 2.7428040999681544e-06, + "loss": 0.572, + "step": 7674 + }, + { + "epoch": 2.833251292148659, + "grad_norm": 0.8731236457824707, + "learning_rate": 2.742320019819769e-06, + "loss": 0.594, + "step": 7675 + }, + { + "epoch": 2.8336204774796947, + "grad_norm": 0.9217734932899475, + "learning_rate": 2.7418359304996472e-06, + "loss": 0.6352, + "step": 7676 + }, + { + "epoch": 2.833989662810731, + "grad_norm": 0.8894678950309753, + "learning_rate": 2.7413518320261122e-06, + "loss": 0.6173, + "step": 7677 + }, + { + "epoch": 2.8343588481417674, + "grad_norm": 0.9051456451416016, + "learning_rate": 2.7408677244174866e-06, + "loss": 0.5846, + "step": 7678 + }, + { + "epoch": 2.8347280334728033, + "grad_norm": 0.8964061737060547, + "learning_rate": 2.7403836076920924e-06, + "loss": 0.6011, + "step": 7679 + }, + { + "epoch": 2.8350972188038397, + "grad_norm": 0.9063067436218262, + "learning_rate": 2.7398994818682557e-06, + "loss": 0.6258, + "step": 7680 + }, + { + "epoch": 2.8354664041348756, + "grad_norm": 0.931656539440155, + "learning_rate": 2.739415346964299e-06, + "loss": 0.6454, + "step": 7681 + }, + { + "epoch": 2.835835589465912, + "grad_norm": 0.9002521634101868, + "learning_rate": 2.7389312029985473e-06, + "loss": 0.628, + "step": 7682 + }, + { + "epoch": 2.836204774796948, + "grad_norm": 0.8884948492050171, + "learning_rate": 2.738447049989324e-06, + "loss": 0.6039, + "step": 7683 + }, + { + "epoch": 2.8365739601279842, + "grad_norm": 0.8972615599632263, + "learning_rate": 2.737962887954956e-06, + "loss": 0.6202, + "step": 7684 + }, + { + "epoch": 2.8369431454590206, + "grad_norm": 0.873630940914154, + "learning_rate": 2.737478716913768e-06, + "loss": 0.5882, + "step": 7685 + }, + { + "epoch": 2.8373123307900565, + "grad_norm": 0.9009674787521362, + "learning_rate": 2.7369945368840846e-06, + "loss": 0.651, + "step": 7686 + }, + { + "epoch": 2.837681516121093, + "grad_norm": 0.8988712430000305, + "learning_rate": 2.7365103478842337e-06, + "loss": 0.6417, + "step": 7687 + }, + { + "epoch": 2.8380507014521292, + "grad_norm": 0.9062166213989258, + "learning_rate": 2.7360261499325397e-06, + "loss": 0.6288, + "step": 7688 + }, + { + "epoch": 2.838419886783165, + "grad_norm": 0.883965015411377, + "learning_rate": 2.735541943047331e-06, + "loss": 0.612, + "step": 7689 + }, + { + "epoch": 2.838789072114201, + "grad_norm": 0.9014157652854919, + "learning_rate": 2.735057727246934e-06, + "loss": 0.6167, + "step": 7690 + }, + { + "epoch": 2.8391582574452374, + "grad_norm": 0.8808509707450867, + "learning_rate": 2.7345735025496772e-06, + "loss": 0.6235, + "step": 7691 + }, + { + "epoch": 2.839527442776274, + "grad_norm": 0.9073087573051453, + "learning_rate": 2.734089268973886e-06, + "loss": 0.6259, + "step": 7692 + }, + { + "epoch": 2.8398966281073097, + "grad_norm": 0.8964331150054932, + "learning_rate": 2.7336050265378907e-06, + "loss": 0.6077, + "step": 7693 + }, + { + "epoch": 2.840265813438346, + "grad_norm": 0.8967471718788147, + "learning_rate": 2.733120775260018e-06, + "loss": 0.6249, + "step": 7694 + }, + { + "epoch": 2.8406349987693824, + "grad_norm": 0.8892977237701416, + "learning_rate": 2.7326365151585988e-06, + "loss": 0.5953, + "step": 7695 + }, + { + "epoch": 2.8410041841004183, + "grad_norm": 0.9408417344093323, + "learning_rate": 2.73215224625196e-06, + "loss": 0.6049, + "step": 7696 + }, + { + "epoch": 2.8413733694314547, + "grad_norm": 0.8860653638839722, + "learning_rate": 2.7316679685584326e-06, + "loss": 0.6193, + "step": 7697 + }, + { + "epoch": 2.841742554762491, + "grad_norm": 0.8985869288444519, + "learning_rate": 2.7311836820963446e-06, + "loss": 0.6319, + "step": 7698 + }, + { + "epoch": 2.842111740093527, + "grad_norm": 0.8873679041862488, + "learning_rate": 2.730699386884028e-06, + "loss": 0.6433, + "step": 7699 + }, + { + "epoch": 2.842480925424563, + "grad_norm": 0.8874045014381409, + "learning_rate": 2.7302150829398123e-06, + "loss": 0.5768, + "step": 7700 + }, + { + "epoch": 2.8428501107555992, + "grad_norm": 0.8672353029251099, + "learning_rate": 2.7297307702820284e-06, + "loss": 0.6006, + "step": 7701 + }, + { + "epoch": 2.8432192960866356, + "grad_norm": 0.8923237919807434, + "learning_rate": 2.7292464489290067e-06, + "loss": 0.5822, + "step": 7702 + }, + { + "epoch": 2.8435884814176715, + "grad_norm": 0.90580153465271, + "learning_rate": 2.7287621188990798e-06, + "loss": 0.6446, + "step": 7703 + }, + { + "epoch": 2.843957666748708, + "grad_norm": 0.8656916618347168, + "learning_rate": 2.7282777802105792e-06, + "loss": 0.5863, + "step": 7704 + }, + { + "epoch": 2.8443268520797442, + "grad_norm": 0.8920660018920898, + "learning_rate": 2.7277934328818365e-06, + "loss": 0.6118, + "step": 7705 + }, + { + "epoch": 2.84469603741078, + "grad_norm": 0.8568210601806641, + "learning_rate": 2.727309076931184e-06, + "loss": 0.5658, + "step": 7706 + }, + { + "epoch": 2.8450652227418165, + "grad_norm": 0.8738588094711304, + "learning_rate": 2.726824712376955e-06, + "loss": 0.5839, + "step": 7707 + }, + { + "epoch": 2.8454344080728524, + "grad_norm": 0.8981674909591675, + "learning_rate": 2.7263403392374817e-06, + "loss": 0.6165, + "step": 7708 + }, + { + "epoch": 2.8458035934038888, + "grad_norm": 0.920540988445282, + "learning_rate": 2.725855957531099e-06, + "loss": 0.6164, + "step": 7709 + }, + { + "epoch": 2.8461727787349247, + "grad_norm": 0.882618248462677, + "learning_rate": 2.7253715672761383e-06, + "loss": 0.5801, + "step": 7710 + }, + { + "epoch": 2.846541964065961, + "grad_norm": 0.8693382740020752, + "learning_rate": 2.724887168490936e-06, + "loss": 0.5806, + "step": 7711 + }, + { + "epoch": 2.8469111493969974, + "grad_norm": 0.8837906718254089, + "learning_rate": 2.7244027611938247e-06, + "loss": 0.6073, + "step": 7712 + }, + { + "epoch": 2.8472803347280333, + "grad_norm": 0.8770569562911987, + "learning_rate": 2.7239183454031396e-06, + "loss": 0.5972, + "step": 7713 + }, + { + "epoch": 2.8476495200590697, + "grad_norm": 0.8746588826179504, + "learning_rate": 2.723433921137216e-06, + "loss": 0.6086, + "step": 7714 + }, + { + "epoch": 2.848018705390106, + "grad_norm": 0.937780499458313, + "learning_rate": 2.72294948841439e-06, + "loss": 0.6458, + "step": 7715 + }, + { + "epoch": 2.848387890721142, + "grad_norm": 0.9489485621452332, + "learning_rate": 2.722465047252995e-06, + "loss": 0.5928, + "step": 7716 + }, + { + "epoch": 2.8487570760521783, + "grad_norm": 0.8760091662406921, + "learning_rate": 2.7219805976713682e-06, + "loss": 0.6352, + "step": 7717 + }, + { + "epoch": 2.8491262613832142, + "grad_norm": 0.9101678133010864, + "learning_rate": 2.721496139687846e-06, + "loss": 0.6478, + "step": 7718 + }, + { + "epoch": 2.8494954467142506, + "grad_norm": 0.8713541626930237, + "learning_rate": 2.721011673320765e-06, + "loss": 0.5821, + "step": 7719 + }, + { + "epoch": 2.8498646320452865, + "grad_norm": 0.8787331581115723, + "learning_rate": 2.720527198588462e-06, + "loss": 0.604, + "step": 7720 + }, + { + "epoch": 2.850233817376323, + "grad_norm": 0.8659200668334961, + "learning_rate": 2.720042715509274e-06, + "loss": 0.62, + "step": 7721 + }, + { + "epoch": 2.8506030027073592, + "grad_norm": 0.8772737979888916, + "learning_rate": 2.7195582241015383e-06, + "loss": 0.6239, + "step": 7722 + }, + { + "epoch": 2.850972188038395, + "grad_norm": 0.8914541006088257, + "learning_rate": 2.719073724383594e-06, + "loss": 0.6398, + "step": 7723 + }, + { + "epoch": 2.8513413733694315, + "grad_norm": 0.8828993439674377, + "learning_rate": 2.718589216373778e-06, + "loss": 0.6269, + "step": 7724 + }, + { + "epoch": 2.851710558700468, + "grad_norm": 0.8920882344245911, + "learning_rate": 2.718104700090429e-06, + "loss": 0.5882, + "step": 7725 + }, + { + "epoch": 2.8520797440315038, + "grad_norm": 0.8862192630767822, + "learning_rate": 2.717620175551885e-06, + "loss": 0.6058, + "step": 7726 + }, + { + "epoch": 2.85244892936254, + "grad_norm": 0.8969277739524841, + "learning_rate": 2.717135642776488e-06, + "loss": 0.6026, + "step": 7727 + }, + { + "epoch": 2.852818114693576, + "grad_norm": 0.8932495713233948, + "learning_rate": 2.7166511017825735e-06, + "loss": 0.6127, + "step": 7728 + }, + { + "epoch": 2.8531873000246124, + "grad_norm": 0.9214999675750732, + "learning_rate": 2.716166552588485e-06, + "loss": 0.631, + "step": 7729 + }, + { + "epoch": 2.8535564853556483, + "grad_norm": 0.9127171039581299, + "learning_rate": 2.7156819952125596e-06, + "loss": 0.6206, + "step": 7730 + }, + { + "epoch": 2.8539256706866847, + "grad_norm": 0.9400171637535095, + "learning_rate": 2.7151974296731385e-06, + "loss": 0.6356, + "step": 7731 + }, + { + "epoch": 2.854294856017721, + "grad_norm": 0.8888329863548279, + "learning_rate": 2.7147128559885634e-06, + "loss": 0.6068, + "step": 7732 + }, + { + "epoch": 2.854664041348757, + "grad_norm": 0.888949990272522, + "learning_rate": 2.7142282741771744e-06, + "loss": 0.621, + "step": 7733 + }, + { + "epoch": 2.8550332266797933, + "grad_norm": 0.8823342323303223, + "learning_rate": 2.713743684257312e-06, + "loss": 0.5894, + "step": 7734 + }, + { + "epoch": 2.8554024120108297, + "grad_norm": 0.8870218396186829, + "learning_rate": 2.7132590862473196e-06, + "loss": 0.5986, + "step": 7735 + }, + { + "epoch": 2.8557715973418656, + "grad_norm": 0.8654966950416565, + "learning_rate": 2.712774480165537e-06, + "loss": 0.6051, + "step": 7736 + }, + { + "epoch": 2.856140782672902, + "grad_norm": 0.8773070573806763, + "learning_rate": 2.7122898660303087e-06, + "loss": 0.628, + "step": 7737 + }, + { + "epoch": 2.856509968003938, + "grad_norm": 0.9060022234916687, + "learning_rate": 2.7118052438599753e-06, + "loss": 0.6132, + "step": 7738 + }, + { + "epoch": 2.856879153334974, + "grad_norm": 0.9025731682777405, + "learning_rate": 2.7113206136728803e-06, + "loss": 0.6323, + "step": 7739 + }, + { + "epoch": 2.85724833866601, + "grad_norm": 0.9022579193115234, + "learning_rate": 2.7108359754873664e-06, + "loss": 0.6184, + "step": 7740 + }, + { + "epoch": 2.8576175239970465, + "grad_norm": 0.9257970452308655, + "learning_rate": 2.710351329321778e-06, + "loss": 0.6091, + "step": 7741 + }, + { + "epoch": 2.857986709328083, + "grad_norm": 0.9300596714019775, + "learning_rate": 2.709866675194458e-06, + "loss": 0.6059, + "step": 7742 + }, + { + "epoch": 2.8583558946591188, + "grad_norm": 0.897185742855072, + "learning_rate": 2.709382013123751e-06, + "loss": 0.5921, + "step": 7743 + }, + { + "epoch": 2.858725079990155, + "grad_norm": 0.8772724270820618, + "learning_rate": 2.708897343128e-06, + "loss": 0.6314, + "step": 7744 + }, + { + "epoch": 2.8590942653211915, + "grad_norm": 0.8623315095901489, + "learning_rate": 2.708412665225551e-06, + "loss": 0.611, + "step": 7745 + }, + { + "epoch": 2.8594634506522274, + "grad_norm": 0.9229100346565247, + "learning_rate": 2.707927979434748e-06, + "loss": 0.6247, + "step": 7746 + }, + { + "epoch": 2.8598326359832638, + "grad_norm": 0.9044189453125, + "learning_rate": 2.7074432857739373e-06, + "loss": 0.6144, + "step": 7747 + }, + { + "epoch": 2.8602018213142997, + "grad_norm": 0.8536921739578247, + "learning_rate": 2.706958584261463e-06, + "loss": 0.5943, + "step": 7748 + }, + { + "epoch": 2.860571006645336, + "grad_norm": 0.9018460512161255, + "learning_rate": 2.706473874915673e-06, + "loss": 0.6184, + "step": 7749 + }, + { + "epoch": 2.860940191976372, + "grad_norm": 0.8894857168197632, + "learning_rate": 2.70598915775491e-06, + "loss": 0.5925, + "step": 7750 + }, + { + "epoch": 2.8613093773074083, + "grad_norm": 0.8736217021942139, + "learning_rate": 2.705504432797524e-06, + "loss": 0.6253, + "step": 7751 + }, + { + "epoch": 2.8616785626384447, + "grad_norm": 0.9163395762443542, + "learning_rate": 2.705019700061859e-06, + "loss": 0.6238, + "step": 7752 + }, + { + "epoch": 2.8620477479694806, + "grad_norm": 0.8874900341033936, + "learning_rate": 2.704534959566264e-06, + "loss": 0.6029, + "step": 7753 + }, + { + "epoch": 2.862416933300517, + "grad_norm": 0.9171255826950073, + "learning_rate": 2.7040502113290846e-06, + "loss": 0.6344, + "step": 7754 + }, + { + "epoch": 2.8627861186315533, + "grad_norm": 0.8990896940231323, + "learning_rate": 2.703565455368669e-06, + "loss": 0.6536, + "step": 7755 + }, + { + "epoch": 2.863155303962589, + "grad_norm": 0.9028767347335815, + "learning_rate": 2.703080691703365e-06, + "loss": 0.6586, + "step": 7756 + }, + { + "epoch": 2.863524489293625, + "grad_norm": 0.9163032174110413, + "learning_rate": 2.7025959203515223e-06, + "loss": 0.62, + "step": 7757 + }, + { + "epoch": 2.8638936746246615, + "grad_norm": 0.9026995897293091, + "learning_rate": 2.7021111413314863e-06, + "loss": 0.598, + "step": 7758 + }, + { + "epoch": 2.864262859955698, + "grad_norm": 0.9078750014305115, + "learning_rate": 2.7016263546616083e-06, + "loss": 0.642, + "step": 7759 + }, + { + "epoch": 2.8646320452867338, + "grad_norm": 0.885564386844635, + "learning_rate": 2.7011415603602358e-06, + "loss": 0.6002, + "step": 7760 + }, + { + "epoch": 2.86500123061777, + "grad_norm": 0.9042371511459351, + "learning_rate": 2.70065675844572e-06, + "loss": 0.5915, + "step": 7761 + }, + { + "epoch": 2.8653704159488065, + "grad_norm": 0.8925498723983765, + "learning_rate": 2.700171948936408e-06, + "loss": 0.6003, + "step": 7762 + }, + { + "epoch": 2.8657396012798424, + "grad_norm": 0.8893204927444458, + "learning_rate": 2.6996871318506517e-06, + "loss": 0.6299, + "step": 7763 + }, + { + "epoch": 2.8661087866108788, + "grad_norm": 0.8745389580726624, + "learning_rate": 2.699202307206799e-06, + "loss": 0.5807, + "step": 7764 + }, + { + "epoch": 2.866477971941915, + "grad_norm": 0.8810576796531677, + "learning_rate": 2.6987174750232037e-06, + "loss": 0.6089, + "step": 7765 + }, + { + "epoch": 2.866847157272951, + "grad_norm": 0.8945237994194031, + "learning_rate": 2.6982326353182135e-06, + "loss": 0.6292, + "step": 7766 + }, + { + "epoch": 2.867216342603987, + "grad_norm": 0.8997896313667297, + "learning_rate": 2.6977477881101814e-06, + "loss": 0.6034, + "step": 7767 + }, + { + "epoch": 2.8675855279350233, + "grad_norm": 0.888792097568512, + "learning_rate": 2.6972629334174578e-06, + "loss": 0.6126, + "step": 7768 + }, + { + "epoch": 2.8679547132660597, + "grad_norm": 0.8855166435241699, + "learning_rate": 2.6967780712583937e-06, + "loss": 0.6287, + "step": 7769 + }, + { + "epoch": 2.8683238985970956, + "grad_norm": 0.9278580546379089, + "learning_rate": 2.6962932016513422e-06, + "loss": 0.6284, + "step": 7770 + }, + { + "epoch": 2.868693083928132, + "grad_norm": 0.8565061688423157, + "learning_rate": 2.6958083246146548e-06, + "loss": 0.5433, + "step": 7771 + }, + { + "epoch": 2.8690622692591683, + "grad_norm": 0.8724117279052734, + "learning_rate": 2.6953234401666844e-06, + "loss": 0.5859, + "step": 7772 + }, + { + "epoch": 2.869431454590204, + "grad_norm": 0.8933340907096863, + "learning_rate": 2.6948385483257838e-06, + "loss": 0.6075, + "step": 7773 + }, + { + "epoch": 2.8698006399212406, + "grad_norm": 0.9537880420684814, + "learning_rate": 2.694353649110304e-06, + "loss": 0.6113, + "step": 7774 + }, + { + "epoch": 2.8701698252522765, + "grad_norm": 0.8856058716773987, + "learning_rate": 2.6938687425386016e-06, + "loss": 0.6044, + "step": 7775 + }, + { + "epoch": 2.870539010583313, + "grad_norm": 0.9253922700881958, + "learning_rate": 2.6933838286290275e-06, + "loss": 0.6348, + "step": 7776 + }, + { + "epoch": 2.8709081959143488, + "grad_norm": 0.9018125534057617, + "learning_rate": 2.6928989073999377e-06, + "loss": 0.6088, + "step": 7777 + }, + { + "epoch": 2.871277381245385, + "grad_norm": 0.8924092054367065, + "learning_rate": 2.6924139788696834e-06, + "loss": 0.6141, + "step": 7778 + }, + { + "epoch": 2.8716465665764215, + "grad_norm": 0.898703932762146, + "learning_rate": 2.6919290430566225e-06, + "loss": 0.6031, + "step": 7779 + }, + { + "epoch": 2.8720157519074574, + "grad_norm": 0.8818811178207397, + "learning_rate": 2.6914440999791067e-06, + "loss": 0.623, + "step": 7780 + }, + { + "epoch": 2.8723849372384938, + "grad_norm": 0.9209259748458862, + "learning_rate": 2.690959149655493e-06, + "loss": 0.6301, + "step": 7781 + }, + { + "epoch": 2.87275412256953, + "grad_norm": 0.91340571641922, + "learning_rate": 2.690474192104135e-06, + "loss": 0.6172, + "step": 7782 + }, + { + "epoch": 2.873123307900566, + "grad_norm": 0.8887362480163574, + "learning_rate": 2.6899892273433893e-06, + "loss": 0.6272, + "step": 7783 + }, + { + "epoch": 2.8734924932316024, + "grad_norm": 0.9146367311477661, + "learning_rate": 2.6895042553916106e-06, + "loss": 0.6522, + "step": 7784 + }, + { + "epoch": 2.8738616785626383, + "grad_norm": 0.8937088251113892, + "learning_rate": 2.6890192762671568e-06, + "loss": 0.6246, + "step": 7785 + }, + { + "epoch": 2.8742308638936747, + "grad_norm": 0.8940175771713257, + "learning_rate": 2.6885342899883828e-06, + "loss": 0.6047, + "step": 7786 + }, + { + "epoch": 2.8746000492247106, + "grad_norm": 0.857182502746582, + "learning_rate": 2.6880492965736455e-06, + "loss": 0.5913, + "step": 7787 + }, + { + "epoch": 2.874969234555747, + "grad_norm": 0.8934565186500549, + "learning_rate": 2.6875642960413005e-06, + "loss": 0.6172, + "step": 7788 + }, + { + "epoch": 2.8753384198867833, + "grad_norm": 0.8705382347106934, + "learning_rate": 2.6870792884097076e-06, + "loss": 0.6044, + "step": 7789 + }, + { + "epoch": 2.875707605217819, + "grad_norm": 0.8731212019920349, + "learning_rate": 2.6865942736972217e-06, + "loss": 0.5915, + "step": 7790 + }, + { + "epoch": 2.8760767905488556, + "grad_norm": 0.8870211839675903, + "learning_rate": 2.6861092519222022e-06, + "loss": 0.6141, + "step": 7791 + }, + { + "epoch": 2.876445975879892, + "grad_norm": 0.922410786151886, + "learning_rate": 2.6856242231030048e-06, + "loss": 0.6174, + "step": 7792 + }, + { + "epoch": 2.876815161210928, + "grad_norm": 0.8880417346954346, + "learning_rate": 2.68513918725799e-06, + "loss": 0.6269, + "step": 7793 + }, + { + "epoch": 2.877184346541964, + "grad_norm": 0.8847613334655762, + "learning_rate": 2.6846541444055157e-06, + "loss": 0.6099, + "step": 7794 + }, + { + "epoch": 2.877553531873, + "grad_norm": 0.9263583421707153, + "learning_rate": 2.6841690945639405e-06, + "loss": 0.6217, + "step": 7795 + }, + { + "epoch": 2.8779227172040365, + "grad_norm": 0.8943008184432983, + "learning_rate": 2.6836840377516226e-06, + "loss": 0.6084, + "step": 7796 + }, + { + "epoch": 2.8782919025350724, + "grad_norm": 0.9145259261131287, + "learning_rate": 2.6831989739869217e-06, + "loss": 0.6557, + "step": 7797 + }, + { + "epoch": 2.8786610878661087, + "grad_norm": 0.9201354384422302, + "learning_rate": 2.6827139032881978e-06, + "loss": 0.624, + "step": 7798 + }, + { + "epoch": 2.879030273197145, + "grad_norm": 0.8954059481620789, + "learning_rate": 2.682228825673811e-06, + "loss": 0.6204, + "step": 7799 + }, + { + "epoch": 2.879399458528181, + "grad_norm": 0.8804138898849487, + "learning_rate": 2.6817437411621194e-06, + "loss": 0.5737, + "step": 7800 + }, + { + "epoch": 2.8797686438592174, + "grad_norm": 0.8963635563850403, + "learning_rate": 2.681258649771486e-06, + "loss": 0.6097, + "step": 7801 + }, + { + "epoch": 2.8801378291902537, + "grad_norm": 0.8836999535560608, + "learning_rate": 2.680773551520268e-06, + "loss": 0.6206, + "step": 7802 + }, + { + "epoch": 2.8805070145212897, + "grad_norm": 0.8868880271911621, + "learning_rate": 2.68028844642683e-06, + "loss": 0.6515, + "step": 7803 + }, + { + "epoch": 2.880876199852326, + "grad_norm": 0.8881941437721252, + "learning_rate": 2.67980333450953e-06, + "loss": 0.6111, + "step": 7804 + }, + { + "epoch": 2.881245385183362, + "grad_norm": 0.8978256583213806, + "learning_rate": 2.6793182157867314e-06, + "loss": 0.5927, + "step": 7805 + }, + { + "epoch": 2.8816145705143983, + "grad_norm": 0.8685243725776672, + "learning_rate": 2.6788330902767943e-06, + "loss": 0.6037, + "step": 7806 + }, + { + "epoch": 2.881983755845434, + "grad_norm": 0.8831161260604858, + "learning_rate": 2.678347957998081e-06, + "loss": 0.5938, + "step": 7807 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.9104934930801392, + "learning_rate": 2.6778628189689543e-06, + "loss": 0.6253, + "step": 7808 + }, + { + "epoch": 2.882722126507507, + "grad_norm": 0.8673095107078552, + "learning_rate": 2.677377673207776e-06, + "loss": 0.6416, + "step": 7809 + }, + { + "epoch": 2.883091311838543, + "grad_norm": 0.917768657207489, + "learning_rate": 2.676892520732908e-06, + "loss": 0.6084, + "step": 7810 + }, + { + "epoch": 2.883460497169579, + "grad_norm": 0.9070786237716675, + "learning_rate": 2.676407361562715e-06, + "loss": 0.6014, + "step": 7811 + }, + { + "epoch": 2.8838296825006156, + "grad_norm": 0.9091067910194397, + "learning_rate": 2.675922195715558e-06, + "loss": 0.6278, + "step": 7812 + }, + { + "epoch": 2.8841988678316515, + "grad_norm": 0.9238804578781128, + "learning_rate": 2.6754370232098024e-06, + "loss": 0.632, + "step": 7813 + }, + { + "epoch": 2.884568053162688, + "grad_norm": 0.8959753513336182, + "learning_rate": 2.6749518440638104e-06, + "loss": 0.5881, + "step": 7814 + }, + { + "epoch": 2.8849372384937237, + "grad_norm": 0.8960715532302856, + "learning_rate": 2.6744666582959465e-06, + "loss": 0.6131, + "step": 7815 + }, + { + "epoch": 2.88530642382476, + "grad_norm": 0.8785313963890076, + "learning_rate": 2.673981465924574e-06, + "loss": 0.5673, + "step": 7816 + }, + { + "epoch": 2.885675609155796, + "grad_norm": 0.9011975526809692, + "learning_rate": 2.673496266968059e-06, + "loss": 0.6106, + "step": 7817 + }, + { + "epoch": 2.8860447944868324, + "grad_norm": 0.8775758147239685, + "learning_rate": 2.673011061444764e-06, + "loss": 0.5884, + "step": 7818 + }, + { + "epoch": 2.8864139798178687, + "grad_norm": 0.9081202745437622, + "learning_rate": 2.672525849373056e-06, + "loss": 0.6345, + "step": 7819 + }, + { + "epoch": 2.8867831651489047, + "grad_norm": 0.8955590128898621, + "learning_rate": 2.6720406307712982e-06, + "loss": 0.6333, + "step": 7820 + }, + { + "epoch": 2.887152350479941, + "grad_norm": 0.9184597134590149, + "learning_rate": 2.671555405657857e-06, + "loss": 0.6381, + "step": 7821 + }, + { + "epoch": 2.8875215358109774, + "grad_norm": 0.8848510980606079, + "learning_rate": 2.6710701740510974e-06, + "loss": 0.5961, + "step": 7822 + }, + { + "epoch": 2.8878907211420133, + "grad_norm": 0.8964948058128357, + "learning_rate": 2.6705849359693864e-06, + "loss": 0.6087, + "step": 7823 + }, + { + "epoch": 2.888259906473049, + "grad_norm": 0.8889433145523071, + "learning_rate": 2.6700996914310897e-06, + "loss": 0.6235, + "step": 7824 + }, + { + "epoch": 2.8886290918040856, + "grad_norm": 0.8950331807136536, + "learning_rate": 2.6696144404545726e-06, + "loss": 0.6164, + "step": 7825 + }, + { + "epoch": 2.888998277135122, + "grad_norm": 0.9074345231056213, + "learning_rate": 2.6691291830582023e-06, + "loss": 0.6059, + "step": 7826 + }, + { + "epoch": 2.889367462466158, + "grad_norm": 0.8868986368179321, + "learning_rate": 2.6686439192603465e-06, + "loss": 0.6205, + "step": 7827 + }, + { + "epoch": 2.889736647797194, + "grad_norm": 0.9038032293319702, + "learning_rate": 2.6681586490793708e-06, + "loss": 0.6211, + "step": 7828 + }, + { + "epoch": 2.8901058331282306, + "grad_norm": 0.8896850347518921, + "learning_rate": 2.6676733725336446e-06, + "loss": 0.6293, + "step": 7829 + }, + { + "epoch": 2.8904750184592665, + "grad_norm": 0.8956795334815979, + "learning_rate": 2.667188089641533e-06, + "loss": 0.6462, + "step": 7830 + }, + { + "epoch": 2.890844203790303, + "grad_norm": 0.9145985245704651, + "learning_rate": 2.666702800421405e-06, + "loss": 0.6308, + "step": 7831 + }, + { + "epoch": 2.891213389121339, + "grad_norm": 0.9154870510101318, + "learning_rate": 2.6662175048916296e-06, + "loss": 0.6051, + "step": 7832 + }, + { + "epoch": 2.891582574452375, + "grad_norm": 0.8842962384223938, + "learning_rate": 2.6657322030705733e-06, + "loss": 0.6475, + "step": 7833 + }, + { + "epoch": 2.891951759783411, + "grad_norm": 0.8810974955558777, + "learning_rate": 2.6652468949766057e-06, + "loss": 0.6024, + "step": 7834 + }, + { + "epoch": 2.8923209451144474, + "grad_norm": 0.90859454870224, + "learning_rate": 2.6647615806280947e-06, + "loss": 0.6322, + "step": 7835 + }, + { + "epoch": 2.8926901304454837, + "grad_norm": 0.9067285656929016, + "learning_rate": 2.6642762600434104e-06, + "loss": 0.5768, + "step": 7836 + }, + { + "epoch": 2.8930593157765196, + "grad_norm": 0.9069514274597168, + "learning_rate": 2.663790933240922e-06, + "loss": 0.6007, + "step": 7837 + }, + { + "epoch": 2.893428501107556, + "grad_norm": 0.9059893488883972, + "learning_rate": 2.6633056002389975e-06, + "loss": 0.6487, + "step": 7838 + }, + { + "epoch": 2.8937976864385924, + "grad_norm": 0.9151869416236877, + "learning_rate": 2.6628202610560082e-06, + "loss": 0.579, + "step": 7839 + }, + { + "epoch": 2.8941668717696283, + "grad_norm": 0.8895944356918335, + "learning_rate": 2.6623349157103227e-06, + "loss": 0.6022, + "step": 7840 + }, + { + "epoch": 2.8945360571006646, + "grad_norm": 0.8937503695487976, + "learning_rate": 2.6618495642203127e-06, + "loss": 0.5939, + "step": 7841 + }, + { + "epoch": 2.8949052424317006, + "grad_norm": 0.9405040740966797, + "learning_rate": 2.661364206604347e-06, + "loss": 0.6197, + "step": 7842 + }, + { + "epoch": 2.895274427762737, + "grad_norm": 0.8775489926338196, + "learning_rate": 2.660878842880798e-06, + "loss": 0.6017, + "step": 7843 + }, + { + "epoch": 2.895643613093773, + "grad_norm": 0.8915306329727173, + "learning_rate": 2.660393473068035e-06, + "loss": 0.6165, + "step": 7844 + }, + { + "epoch": 2.896012798424809, + "grad_norm": 0.9275400638580322, + "learning_rate": 2.6599080971844293e-06, + "loss": 0.614, + "step": 7845 + }, + { + "epoch": 2.8963819837558455, + "grad_norm": 0.8940399289131165, + "learning_rate": 2.6594227152483533e-06, + "loss": 0.6299, + "step": 7846 + }, + { + "epoch": 2.8967511690868815, + "grad_norm": 0.9127199649810791, + "learning_rate": 2.6589373272781775e-06, + "loss": 0.6215, + "step": 7847 + }, + { + "epoch": 2.897120354417918, + "grad_norm": 0.9030557870864868, + "learning_rate": 2.6584519332922743e-06, + "loss": 0.6098, + "step": 7848 + }, + { + "epoch": 2.897489539748954, + "grad_norm": 0.8976365923881531, + "learning_rate": 2.6579665333090152e-06, + "loss": 0.6417, + "step": 7849 + }, + { + "epoch": 2.89785872507999, + "grad_norm": 0.888471245765686, + "learning_rate": 2.657481127346772e-06, + "loss": 0.6503, + "step": 7850 + }, + { + "epoch": 2.8982279104110265, + "grad_norm": 0.8978638052940369, + "learning_rate": 2.656995715423919e-06, + "loss": 0.599, + "step": 7851 + }, + { + "epoch": 2.8985970957420624, + "grad_norm": 0.9754461646080017, + "learning_rate": 2.656510297558827e-06, + "loss": 0.6298, + "step": 7852 + }, + { + "epoch": 2.8989662810730987, + "grad_norm": 0.8921336531639099, + "learning_rate": 2.6560248737698703e-06, + "loss": 0.6087, + "step": 7853 + }, + { + "epoch": 2.8993354664041346, + "grad_norm": 0.9461963772773743, + "learning_rate": 2.655539444075421e-06, + "loss": 0.6015, + "step": 7854 + }, + { + "epoch": 2.899704651735171, + "grad_norm": 0.8876105546951294, + "learning_rate": 2.655054008493852e-06, + "loss": 0.5968, + "step": 7855 + }, + { + "epoch": 2.9000738370662074, + "grad_norm": 0.8754955530166626, + "learning_rate": 2.6545685670435383e-06, + "loss": 0.5753, + "step": 7856 + }, + { + "epoch": 2.9004430223972433, + "grad_norm": 0.8883516788482666, + "learning_rate": 2.6540831197428534e-06, + "loss": 0.6327, + "step": 7857 + }, + { + "epoch": 2.9008122077282796, + "grad_norm": 0.9218360185623169, + "learning_rate": 2.653597666610171e-06, + "loss": 0.6022, + "step": 7858 + }, + { + "epoch": 2.901181393059316, + "grad_norm": 0.9080891013145447, + "learning_rate": 2.653112207663865e-06, + "loss": 0.5874, + "step": 7859 + }, + { + "epoch": 2.901550578390352, + "grad_norm": 0.8966799974441528, + "learning_rate": 2.6526267429223105e-06, + "loss": 0.5769, + "step": 7860 + }, + { + "epoch": 2.9019197637213883, + "grad_norm": 0.9110830426216125, + "learning_rate": 2.6521412724038824e-06, + "loss": 0.6398, + "step": 7861 + }, + { + "epoch": 2.902288949052424, + "grad_norm": 0.9375808835029602, + "learning_rate": 2.6516557961269547e-06, + "loss": 0.6122, + "step": 7862 + }, + { + "epoch": 2.9026581343834605, + "grad_norm": 0.8640148043632507, + "learning_rate": 2.651170314109903e-06, + "loss": 0.6023, + "step": 7863 + }, + { + "epoch": 2.9030273197144965, + "grad_norm": 0.8920549750328064, + "learning_rate": 2.650684826371102e-06, + "loss": 0.6047, + "step": 7864 + }, + { + "epoch": 2.903396505045533, + "grad_norm": 0.8739588260650635, + "learning_rate": 2.6501993329289293e-06, + "loss": 0.5995, + "step": 7865 + }, + { + "epoch": 2.903765690376569, + "grad_norm": 0.8992472290992737, + "learning_rate": 2.649713833801758e-06, + "loss": 0.6382, + "step": 7866 + }, + { + "epoch": 2.904134875707605, + "grad_norm": 0.8911950588226318, + "learning_rate": 2.649228329007967e-06, + "loss": 0.5921, + "step": 7867 + }, + { + "epoch": 2.9045040610386414, + "grad_norm": 0.8968974947929382, + "learning_rate": 2.6487428185659292e-06, + "loss": 0.6083, + "step": 7868 + }, + { + "epoch": 2.904873246369678, + "grad_norm": 0.8766686320304871, + "learning_rate": 2.6482573024940227e-06, + "loss": 0.5975, + "step": 7869 + }, + { + "epoch": 2.9052424317007137, + "grad_norm": 0.8490191102027893, + "learning_rate": 2.6477717808106245e-06, + "loss": 0.6053, + "step": 7870 + }, + { + "epoch": 2.90561161703175, + "grad_norm": 0.8970863223075867, + "learning_rate": 2.6472862535341115e-06, + "loss": 0.6036, + "step": 7871 + }, + { + "epoch": 2.905980802362786, + "grad_norm": 0.9165747761726379, + "learning_rate": 2.6468007206828596e-06, + "loss": 0.6291, + "step": 7872 + }, + { + "epoch": 2.9063499876938224, + "grad_norm": 0.877228856086731, + "learning_rate": 2.646315182275247e-06, + "loss": 0.6209, + "step": 7873 + }, + { + "epoch": 2.9067191730248583, + "grad_norm": 0.893355131149292, + "learning_rate": 2.6458296383296507e-06, + "loss": 0.6074, + "step": 7874 + }, + { + "epoch": 2.9070883583558946, + "grad_norm": 0.8719050884246826, + "learning_rate": 2.6453440888644494e-06, + "loss": 0.625, + "step": 7875 + }, + { + "epoch": 2.907457543686931, + "grad_norm": 0.8988303542137146, + "learning_rate": 2.6448585338980194e-06, + "loss": 0.5946, + "step": 7876 + }, + { + "epoch": 2.907826729017967, + "grad_norm": 0.9143255949020386, + "learning_rate": 2.64437297344874e-06, + "loss": 0.6048, + "step": 7877 + }, + { + "epoch": 2.9081959143490033, + "grad_norm": 0.8487257361412048, + "learning_rate": 2.643887407534989e-06, + "loss": 0.5644, + "step": 7878 + }, + { + "epoch": 2.9085650996800396, + "grad_norm": 0.8713811635971069, + "learning_rate": 2.643401836175145e-06, + "loss": 0.6258, + "step": 7879 + }, + { + "epoch": 2.9089342850110755, + "grad_norm": 0.9243729114532471, + "learning_rate": 2.642916259387586e-06, + "loss": 0.6334, + "step": 7880 + }, + { + "epoch": 2.909303470342112, + "grad_norm": 0.8986839652061462, + "learning_rate": 2.642430677190693e-06, + "loss": 0.574, + "step": 7881 + }, + { + "epoch": 2.909672655673148, + "grad_norm": 0.8826963901519775, + "learning_rate": 2.6419450896028436e-06, + "loss": 0.5857, + "step": 7882 + }, + { + "epoch": 2.910041841004184, + "grad_norm": 0.8903241157531738, + "learning_rate": 2.6414594966424165e-06, + "loss": 0.6297, + "step": 7883 + }, + { + "epoch": 2.91041102633522, + "grad_norm": 0.8797394633293152, + "learning_rate": 2.6409738983277926e-06, + "loss": 0.6173, + "step": 7884 + }, + { + "epoch": 2.9107802116662564, + "grad_norm": 0.9352104663848877, + "learning_rate": 2.640488294677352e-06, + "loss": 0.6324, + "step": 7885 + }, + { + "epoch": 2.911149396997293, + "grad_norm": 0.9153450727462769, + "learning_rate": 2.6400026857094723e-06, + "loss": 0.6232, + "step": 7886 + }, + { + "epoch": 2.9115185823283287, + "grad_norm": 0.9125748872756958, + "learning_rate": 2.6395170714425363e-06, + "loss": 0.6227, + "step": 7887 + }, + { + "epoch": 2.911887767659365, + "grad_norm": 0.8864941000938416, + "learning_rate": 2.639031451894923e-06, + "loss": 0.628, + "step": 7888 + }, + { + "epoch": 2.9122569529904014, + "grad_norm": 0.8797711133956909, + "learning_rate": 2.638545827085012e-06, + "loss": 0.6312, + "step": 7889 + }, + { + "epoch": 2.9126261383214374, + "grad_norm": 1.01199209690094, + "learning_rate": 2.6380601970311858e-06, + "loss": 0.5932, + "step": 7890 + }, + { + "epoch": 2.9129953236524733, + "grad_norm": 0.8764479756355286, + "learning_rate": 2.6375745617518255e-06, + "loss": 0.6369, + "step": 7891 + }, + { + "epoch": 2.9133645089835096, + "grad_norm": 0.9200306534767151, + "learning_rate": 2.6370889212653107e-06, + "loss": 0.6086, + "step": 7892 + }, + { + "epoch": 2.913733694314546, + "grad_norm": 0.868410050868988, + "learning_rate": 2.6366032755900233e-06, + "loss": 0.5939, + "step": 7893 + }, + { + "epoch": 2.914102879645582, + "grad_norm": 1.0550905466079712, + "learning_rate": 2.6361176247443455e-06, + "loss": 0.5988, + "step": 7894 + }, + { + "epoch": 2.9144720649766183, + "grad_norm": 0.9330690503120422, + "learning_rate": 2.6356319687466584e-06, + "loss": 0.6484, + "step": 7895 + }, + { + "epoch": 2.9148412503076546, + "grad_norm": 0.8870779871940613, + "learning_rate": 2.6351463076153444e-06, + "loss": 0.5796, + "step": 7896 + }, + { + "epoch": 2.9152104356386905, + "grad_norm": 0.8916473388671875, + "learning_rate": 2.634660641368785e-06, + "loss": 0.6069, + "step": 7897 + }, + { + "epoch": 2.915579620969727, + "grad_norm": 0.8910322189331055, + "learning_rate": 2.6341749700253633e-06, + "loss": 0.6276, + "step": 7898 + }, + { + "epoch": 2.9159488063007633, + "grad_norm": 0.896709144115448, + "learning_rate": 2.633689293603461e-06, + "loss": 0.6244, + "step": 7899 + }, + { + "epoch": 2.916317991631799, + "grad_norm": 0.9142293930053711, + "learning_rate": 2.6332036121214603e-06, + "loss": 0.6393, + "step": 7900 + }, + { + "epoch": 2.916687176962835, + "grad_norm": 0.8791609406471252, + "learning_rate": 2.6327179255977464e-06, + "loss": 0.6303, + "step": 7901 + }, + { + "epoch": 2.9170563622938714, + "grad_norm": 0.8930336833000183, + "learning_rate": 2.6322322340506994e-06, + "loss": 0.6514, + "step": 7902 + }, + { + "epoch": 2.917425547624908, + "grad_norm": 0.8908393383026123, + "learning_rate": 2.631746537498705e-06, + "loss": 0.6149, + "step": 7903 + }, + { + "epoch": 2.9177947329559437, + "grad_norm": 0.8743337988853455, + "learning_rate": 2.6312608359601453e-06, + "loss": 0.6322, + "step": 7904 + }, + { + "epoch": 2.91816391828698, + "grad_norm": 0.8913321495056152, + "learning_rate": 2.630775129453405e-06, + "loss": 0.6117, + "step": 7905 + }, + { + "epoch": 2.9185331036180164, + "grad_norm": 0.9008121490478516, + "learning_rate": 2.630289417996867e-06, + "loss": 0.613, + "step": 7906 + }, + { + "epoch": 2.9189022889490523, + "grad_norm": 0.9142534136772156, + "learning_rate": 2.6298037016089155e-06, + "loss": 0.6351, + "step": 7907 + }, + { + "epoch": 2.9192714742800887, + "grad_norm": 0.8749108910560608, + "learning_rate": 2.6293179803079354e-06, + "loss": 0.6008, + "step": 7908 + }, + { + "epoch": 2.9196406596111246, + "grad_norm": 0.9044275879859924, + "learning_rate": 2.6288322541123094e-06, + "loss": 0.5938, + "step": 7909 + }, + { + "epoch": 2.920009844942161, + "grad_norm": 0.8686652183532715, + "learning_rate": 2.6283465230404244e-06, + "loss": 0.615, + "step": 7910 + }, + { + "epoch": 2.920379030273197, + "grad_norm": 0.8875068426132202, + "learning_rate": 2.627860787110663e-06, + "loss": 0.6394, + "step": 7911 + }, + { + "epoch": 2.9207482156042333, + "grad_norm": 0.8925892114639282, + "learning_rate": 2.6273750463414122e-06, + "loss": 0.6102, + "step": 7912 + }, + { + "epoch": 2.9211174009352696, + "grad_norm": 1.0221152305603027, + "learning_rate": 2.6268893007510556e-06, + "loss": 0.6071, + "step": 7913 + }, + { + "epoch": 2.9214865862663055, + "grad_norm": 0.8891804814338684, + "learning_rate": 2.626403550357979e-06, + "loss": 0.6124, + "step": 7914 + }, + { + "epoch": 2.921855771597342, + "grad_norm": 0.8828620314598083, + "learning_rate": 2.625917795180568e-06, + "loss": 0.5837, + "step": 7915 + }, + { + "epoch": 2.9222249569283782, + "grad_norm": 0.8946750164031982, + "learning_rate": 2.6254320352372083e-06, + "loss": 0.6363, + "step": 7916 + }, + { + "epoch": 2.922594142259414, + "grad_norm": 0.9002360105514526, + "learning_rate": 2.6249462705462854e-06, + "loss": 0.6031, + "step": 7917 + }, + { + "epoch": 2.9229633275904505, + "grad_norm": 0.894027590751648, + "learning_rate": 2.624460501126186e-06, + "loss": 0.6357, + "step": 7918 + }, + { + "epoch": 2.9233325129214864, + "grad_norm": 0.9176084995269775, + "learning_rate": 2.6239747269952953e-06, + "loss": 0.6627, + "step": 7919 + }, + { + "epoch": 2.923701698252523, + "grad_norm": 0.9201535582542419, + "learning_rate": 2.6234889481720003e-06, + "loss": 0.6209, + "step": 7920 + }, + { + "epoch": 2.9240708835835587, + "grad_norm": 0.8920616507530212, + "learning_rate": 2.6230031646746886e-06, + "loss": 0.5836, + "step": 7921 + }, + { + "epoch": 2.924440068914595, + "grad_norm": 0.8866165280342102, + "learning_rate": 2.6225173765217454e-06, + "loss": 0.6119, + "step": 7922 + }, + { + "epoch": 2.9248092542456314, + "grad_norm": 0.9156001210212708, + "learning_rate": 2.6220315837315573e-06, + "loss": 0.6272, + "step": 7923 + }, + { + "epoch": 2.9251784395766673, + "grad_norm": 0.898540198802948, + "learning_rate": 2.621545786322513e-06, + "loss": 0.6255, + "step": 7924 + }, + { + "epoch": 2.9255476249077037, + "grad_norm": 0.8819402456283569, + "learning_rate": 2.6210599843129993e-06, + "loss": 0.6098, + "step": 7925 + }, + { + "epoch": 2.92591681023874, + "grad_norm": 0.883496105670929, + "learning_rate": 2.6205741777214033e-06, + "loss": 0.6169, + "step": 7926 + }, + { + "epoch": 2.926285995569776, + "grad_norm": 0.9012805819511414, + "learning_rate": 2.6200883665661124e-06, + "loss": 0.6191, + "step": 7927 + }, + { + "epoch": 2.9266551809008123, + "grad_norm": 0.8650801777839661, + "learning_rate": 2.6196025508655152e-06, + "loss": 0.5804, + "step": 7928 + }, + { + "epoch": 2.9270243662318483, + "grad_norm": 0.8968990445137024, + "learning_rate": 2.6191167306379987e-06, + "loss": 0.6086, + "step": 7929 + }, + { + "epoch": 2.9273935515628846, + "grad_norm": 0.9064731001853943, + "learning_rate": 2.6186309059019516e-06, + "loss": 0.6171, + "step": 7930 + }, + { + "epoch": 2.9277627368939205, + "grad_norm": 0.9068061709403992, + "learning_rate": 2.618145076675762e-06, + "loss": 0.644, + "step": 7931 + }, + { + "epoch": 2.928131922224957, + "grad_norm": 0.8923901319503784, + "learning_rate": 2.617659242977819e-06, + "loss": 0.5962, + "step": 7932 + }, + { + "epoch": 2.9285011075559932, + "grad_norm": 0.907606840133667, + "learning_rate": 2.617173404826511e-06, + "loss": 0.6344, + "step": 7933 + }, + { + "epoch": 2.928870292887029, + "grad_norm": 0.9036347270011902, + "learning_rate": 2.616687562240226e-06, + "loss": 0.6135, + "step": 7934 + }, + { + "epoch": 2.9292394782180655, + "grad_norm": 0.8672775626182556, + "learning_rate": 2.6162017152373535e-06, + "loss": 0.6257, + "step": 7935 + }, + { + "epoch": 2.929608663549102, + "grad_norm": 0.8816257119178772, + "learning_rate": 2.6157158638362835e-06, + "loss": 0.6139, + "step": 7936 + }, + { + "epoch": 2.929977848880138, + "grad_norm": 0.9009481072425842, + "learning_rate": 2.615230008055404e-06, + "loss": 0.5948, + "step": 7937 + }, + { + "epoch": 2.930347034211174, + "grad_norm": 0.8790907859802246, + "learning_rate": 2.614744147913105e-06, + "loss": 0.6012, + "step": 7938 + }, + { + "epoch": 2.93071621954221, + "grad_norm": 0.8913215398788452, + "learning_rate": 2.6142582834277767e-06, + "loss": 0.5989, + "step": 7939 + }, + { + "epoch": 2.9310854048732464, + "grad_norm": 0.8801307082176208, + "learning_rate": 2.613772414617808e-06, + "loss": 0.6635, + "step": 7940 + }, + { + "epoch": 2.9314545902042823, + "grad_norm": 0.9095162153244019, + "learning_rate": 2.6132865415015894e-06, + "loss": 0.5798, + "step": 7941 + }, + { + "epoch": 2.9318237755353187, + "grad_norm": 0.8876269459724426, + "learning_rate": 2.6128006640975114e-06, + "loss": 0.6074, + "step": 7942 + }, + { + "epoch": 2.932192960866355, + "grad_norm": 0.9158062934875488, + "learning_rate": 2.6123147824239633e-06, + "loss": 0.5921, + "step": 7943 + }, + { + "epoch": 2.932562146197391, + "grad_norm": 0.8823740482330322, + "learning_rate": 2.611828896499336e-06, + "loss": 0.6083, + "step": 7944 + }, + { + "epoch": 2.9329313315284273, + "grad_norm": 0.8889486789703369, + "learning_rate": 2.6113430063420203e-06, + "loss": 0.6099, + "step": 7945 + }, + { + "epoch": 2.9333005168594637, + "grad_norm": 0.9020202159881592, + "learning_rate": 2.6108571119704078e-06, + "loss": 0.6155, + "step": 7946 + }, + { + "epoch": 2.9336697021904996, + "grad_norm": 0.9094185829162598, + "learning_rate": 2.610371213402888e-06, + "loss": 0.618, + "step": 7947 + }, + { + "epoch": 2.934038887521536, + "grad_norm": 0.9216052889823914, + "learning_rate": 2.6098853106578533e-06, + "loss": 0.6419, + "step": 7948 + }, + { + "epoch": 2.934408072852572, + "grad_norm": 0.8996232748031616, + "learning_rate": 2.6093994037536925e-06, + "loss": 0.6007, + "step": 7949 + }, + { + "epoch": 2.9347772581836082, + "grad_norm": 0.8946903944015503, + "learning_rate": 2.6089134927088005e-06, + "loss": 0.606, + "step": 7950 + }, + { + "epoch": 2.935146443514644, + "grad_norm": 0.8825215697288513, + "learning_rate": 2.6084275775415665e-06, + "loss": 0.6216, + "step": 7951 + }, + { + "epoch": 2.9355156288456805, + "grad_norm": 0.8714663982391357, + "learning_rate": 2.6079416582703833e-06, + "loss": 0.5935, + "step": 7952 + }, + { + "epoch": 2.935884814176717, + "grad_norm": 0.9005582332611084, + "learning_rate": 2.6074557349136412e-06, + "loss": 0.619, + "step": 7953 + }, + { + "epoch": 2.936253999507753, + "grad_norm": 0.8380026817321777, + "learning_rate": 2.606969807489735e-06, + "loss": 0.5946, + "step": 7954 + }, + { + "epoch": 2.936623184838789, + "grad_norm": 0.9166381359100342, + "learning_rate": 2.606483876017054e-06, + "loss": 0.6602, + "step": 7955 + }, + { + "epoch": 2.9369923701698255, + "grad_norm": 0.8955850005149841, + "learning_rate": 2.605997940513993e-06, + "loss": 0.6082, + "step": 7956 + }, + { + "epoch": 2.9373615555008614, + "grad_norm": 0.9143552184104919, + "learning_rate": 2.605512000998943e-06, + "loss": 0.6173, + "step": 7957 + }, + { + "epoch": 2.9377307408318973, + "grad_norm": 0.8913809061050415, + "learning_rate": 2.605026057490297e-06, + "loss": 0.6014, + "step": 7958 + }, + { + "epoch": 2.9380999261629337, + "grad_norm": 0.8709903359413147, + "learning_rate": 2.604540110006448e-06, + "loss": 0.6179, + "step": 7959 + }, + { + "epoch": 2.93846911149397, + "grad_norm": 0.8951790928840637, + "learning_rate": 2.6040541585657885e-06, + "loss": 0.5938, + "step": 7960 + }, + { + "epoch": 2.938838296825006, + "grad_norm": 0.8832489252090454, + "learning_rate": 2.6035682031867122e-06, + "loss": 0.6087, + "step": 7961 + }, + { + "epoch": 2.9392074821560423, + "grad_norm": 0.8776866793632507, + "learning_rate": 2.6030822438876124e-06, + "loss": 0.5751, + "step": 7962 + }, + { + "epoch": 2.9395766674870787, + "grad_norm": 0.8861465454101562, + "learning_rate": 2.6025962806868814e-06, + "loss": 0.5893, + "step": 7963 + }, + { + "epoch": 2.9399458528181146, + "grad_norm": 0.9056015610694885, + "learning_rate": 2.6021103136029143e-06, + "loss": 0.6305, + "step": 7964 + }, + { + "epoch": 2.940315038149151, + "grad_norm": 0.8963605761528015, + "learning_rate": 2.6016243426541036e-06, + "loss": 0.6181, + "step": 7965 + }, + { + "epoch": 2.9406842234801873, + "grad_norm": 0.8999082446098328, + "learning_rate": 2.6011383678588443e-06, + "loss": 0.6194, + "step": 7966 + }, + { + "epoch": 2.9410534088112232, + "grad_norm": 0.9013026356697083, + "learning_rate": 2.600652389235529e-06, + "loss": 0.599, + "step": 7967 + }, + { + "epoch": 2.941422594142259, + "grad_norm": 0.8831832408905029, + "learning_rate": 2.6001664068025523e-06, + "loss": 0.6086, + "step": 7968 + }, + { + "epoch": 2.9417917794732955, + "grad_norm": 0.8992711901664734, + "learning_rate": 2.599680420578309e-06, + "loss": 0.5824, + "step": 7969 + }, + { + "epoch": 2.942160964804332, + "grad_norm": 0.8920653462409973, + "learning_rate": 2.5991944305811937e-06, + "loss": 0.6068, + "step": 7970 + }, + { + "epoch": 2.942530150135368, + "grad_norm": 0.8879903554916382, + "learning_rate": 2.5987084368296e-06, + "loss": 0.6127, + "step": 7971 + }, + { + "epoch": 2.942899335466404, + "grad_norm": 0.9111250638961792, + "learning_rate": 2.598222439341924e-06, + "loss": 0.6132, + "step": 7972 + }, + { + "epoch": 2.9432685207974405, + "grad_norm": 0.8898036479949951, + "learning_rate": 2.597736438136558e-06, + "loss": 0.6059, + "step": 7973 + }, + { + "epoch": 2.9436377061284764, + "grad_norm": 0.8983686566352844, + "learning_rate": 2.5972504332319004e-06, + "loss": 0.6059, + "step": 7974 + }, + { + "epoch": 2.9440068914595128, + "grad_norm": 0.9212443232536316, + "learning_rate": 2.5967644246463437e-06, + "loss": 0.6088, + "step": 7975 + }, + { + "epoch": 2.9443760767905487, + "grad_norm": 0.9017109870910645, + "learning_rate": 2.5962784123982843e-06, + "loss": 0.6568, + "step": 7976 + }, + { + "epoch": 2.944745262121585, + "grad_norm": 0.9016318321228027, + "learning_rate": 2.595792396506117e-06, + "loss": 0.6102, + "step": 7977 + }, + { + "epoch": 2.945114447452621, + "grad_norm": 0.9226755499839783, + "learning_rate": 2.595306376988238e-06, + "loss": 0.6275, + "step": 7978 + }, + { + "epoch": 2.9454836327836573, + "grad_norm": 0.8872864842414856, + "learning_rate": 2.594820353863043e-06, + "loss": 0.6105, + "step": 7979 + }, + { + "epoch": 2.9458528181146937, + "grad_norm": 0.9071178436279297, + "learning_rate": 2.5943343271489274e-06, + "loss": 0.6061, + "step": 7980 + }, + { + "epoch": 2.9462220034457296, + "grad_norm": 0.8726102113723755, + "learning_rate": 2.593848296864287e-06, + "loss": 0.6128, + "step": 7981 + }, + { + "epoch": 2.946591188776766, + "grad_norm": 0.897532045841217, + "learning_rate": 2.593362263027518e-06, + "loss": 0.6247, + "step": 7982 + }, + { + "epoch": 2.9469603741078023, + "grad_norm": 0.8939472436904907, + "learning_rate": 2.5928762256570174e-06, + "loss": 0.613, + "step": 7983 + }, + { + "epoch": 2.9473295594388382, + "grad_norm": 0.8735429048538208, + "learning_rate": 2.5923901847711806e-06, + "loss": 0.6534, + "step": 7984 + }, + { + "epoch": 2.9476987447698746, + "grad_norm": 0.8887305855751038, + "learning_rate": 2.5919041403884043e-06, + "loss": 0.6467, + "step": 7985 + }, + { + "epoch": 2.9480679301009105, + "grad_norm": 0.9169960021972656, + "learning_rate": 2.591418092527086e-06, + "loss": 0.6375, + "step": 7986 + }, + { + "epoch": 2.948437115431947, + "grad_norm": 0.8926560282707214, + "learning_rate": 2.5909320412056203e-06, + "loss": 0.6252, + "step": 7987 + }, + { + "epoch": 2.948806300762983, + "grad_norm": 0.9050718545913696, + "learning_rate": 2.5904459864424074e-06, + "loss": 0.6082, + "step": 7988 + }, + { + "epoch": 2.949175486094019, + "grad_norm": 0.9012325406074524, + "learning_rate": 2.5899599282558413e-06, + "loss": 0.6074, + "step": 7989 + }, + { + "epoch": 2.9495446714250555, + "grad_norm": 0.9230538606643677, + "learning_rate": 2.589473866664321e-06, + "loss": 0.6261, + "step": 7990 + }, + { + "epoch": 2.9499138567560914, + "grad_norm": 0.8872868418693542, + "learning_rate": 2.5889878016862426e-06, + "loss": 0.5934, + "step": 7991 + }, + { + "epoch": 2.9502830420871278, + "grad_norm": 0.9062955975532532, + "learning_rate": 2.5885017333400042e-06, + "loss": 0.6236, + "step": 7992 + }, + { + "epoch": 2.950652227418164, + "grad_norm": 0.9030137658119202, + "learning_rate": 2.588015661644003e-06, + "loss": 0.6422, + "step": 7993 + }, + { + "epoch": 2.9510214127492, + "grad_norm": 0.8809729218482971, + "learning_rate": 2.5875295866166368e-06, + "loss": 0.5964, + "step": 7994 + }, + { + "epoch": 2.9513905980802364, + "grad_norm": 0.9118802547454834, + "learning_rate": 2.5870435082763037e-06, + "loss": 0.6285, + "step": 7995 + }, + { + "epoch": 2.9517597834112723, + "grad_norm": 0.8944644927978516, + "learning_rate": 2.5865574266414007e-06, + "loss": 0.6161, + "step": 7996 + }, + { + "epoch": 2.9521289687423087, + "grad_norm": 0.8674063682556152, + "learning_rate": 2.5860713417303264e-06, + "loss": 0.5786, + "step": 7997 + }, + { + "epoch": 2.9524981540733446, + "grad_norm": 0.8869565725326538, + "learning_rate": 2.5855852535614796e-06, + "loss": 0.6109, + "step": 7998 + }, + { + "epoch": 2.952867339404381, + "grad_norm": 0.9028680920600891, + "learning_rate": 2.5850991621532572e-06, + "loss": 0.6281, + "step": 7999 + }, + { + "epoch": 2.9532365247354173, + "grad_norm": 0.8684206008911133, + "learning_rate": 2.5846130675240596e-06, + "loss": 0.5689, + "step": 8000 + }, + { + "epoch": 2.9536057100664532, + "grad_norm": 0.9036666750907898, + "learning_rate": 2.584126969692282e-06, + "loss": 0.6003, + "step": 8001 + }, + { + "epoch": 2.9539748953974896, + "grad_norm": 0.8974305391311646, + "learning_rate": 2.583640868676327e-06, + "loss": 0.6194, + "step": 8002 + }, + { + "epoch": 2.954344080728526, + "grad_norm": 0.880155086517334, + "learning_rate": 2.583154764494591e-06, + "loss": 0.6518, + "step": 8003 + }, + { + "epoch": 2.954713266059562, + "grad_norm": 0.9045276045799255, + "learning_rate": 2.5826686571654734e-06, + "loss": 0.6168, + "step": 8004 + }, + { + "epoch": 2.955082451390598, + "grad_norm": 0.8820865750312805, + "learning_rate": 2.582182546707373e-06, + "loss": 0.6263, + "step": 8005 + }, + { + "epoch": 2.955451636721634, + "grad_norm": 0.9026426672935486, + "learning_rate": 2.581696433138689e-06, + "loss": 0.5762, + "step": 8006 + }, + { + "epoch": 2.9558208220526705, + "grad_norm": 0.900384247303009, + "learning_rate": 2.581210316477821e-06, + "loss": 0.6166, + "step": 8007 + }, + { + "epoch": 2.9561900073837064, + "grad_norm": 0.8755916357040405, + "learning_rate": 2.580724196743169e-06, + "loss": 0.5846, + "step": 8008 + }, + { + "epoch": 2.9565591927147428, + "grad_norm": 0.8778271079063416, + "learning_rate": 2.580238073953131e-06, + "loss": 0.5887, + "step": 8009 + }, + { + "epoch": 2.956928378045779, + "grad_norm": 0.8947851657867432, + "learning_rate": 2.5797519481261067e-06, + "loss": 0.5939, + "step": 8010 + }, + { + "epoch": 2.957297563376815, + "grad_norm": 0.8996351957321167, + "learning_rate": 2.5792658192804965e-06, + "loss": 0.6209, + "step": 8011 + }, + { + "epoch": 2.9576667487078514, + "grad_norm": 0.8864438533782959, + "learning_rate": 2.5787796874347006e-06, + "loss": 0.6327, + "step": 8012 + }, + { + "epoch": 2.9580359340388878, + "grad_norm": 0.8772410154342651, + "learning_rate": 2.578293552607118e-06, + "loss": 0.5827, + "step": 8013 + }, + { + "epoch": 2.9584051193699237, + "grad_norm": 0.8913734555244446, + "learning_rate": 2.57780741481615e-06, + "loss": 0.6237, + "step": 8014 + }, + { + "epoch": 2.95877430470096, + "grad_norm": 0.883000373840332, + "learning_rate": 2.5773212740801947e-06, + "loss": 0.6169, + "step": 8015 + }, + { + "epoch": 2.959143490031996, + "grad_norm": 0.8887459635734558, + "learning_rate": 2.5768351304176535e-06, + "loss": 0.6307, + "step": 8016 + }, + { + "epoch": 2.9595126753630323, + "grad_norm": 0.9014142751693726, + "learning_rate": 2.5763489838469276e-06, + "loss": 0.5823, + "step": 8017 + }, + { + "epoch": 2.9598818606940682, + "grad_norm": 0.8622140884399414, + "learning_rate": 2.5758628343864167e-06, + "loss": 0.5946, + "step": 8018 + }, + { + "epoch": 2.9602510460251046, + "grad_norm": 0.9046510457992554, + "learning_rate": 2.5753766820545212e-06, + "loss": 0.5971, + "step": 8019 + }, + { + "epoch": 2.960620231356141, + "grad_norm": 0.8918441534042358, + "learning_rate": 2.5748905268696422e-06, + "loss": 0.5987, + "step": 8020 + }, + { + "epoch": 2.960989416687177, + "grad_norm": 0.9170805215835571, + "learning_rate": 2.57440436885018e-06, + "loss": 0.6267, + "step": 8021 + }, + { + "epoch": 2.961358602018213, + "grad_norm": 0.891851007938385, + "learning_rate": 2.573918208014537e-06, + "loss": 0.5891, + "step": 8022 + }, + { + "epoch": 2.9617277873492496, + "grad_norm": 0.8783304691314697, + "learning_rate": 2.5734320443811123e-06, + "loss": 0.574, + "step": 8023 + }, + { + "epoch": 2.9620969726802855, + "grad_norm": 0.891708493232727, + "learning_rate": 2.572945877968308e-06, + "loss": 0.6031, + "step": 8024 + }, + { + "epoch": 2.962466158011322, + "grad_norm": 0.895087480545044, + "learning_rate": 2.572459708794525e-06, + "loss": 0.6096, + "step": 8025 + }, + { + "epoch": 2.9628353433423578, + "grad_norm": 0.9306208491325378, + "learning_rate": 2.571973536878165e-06, + "loss": 0.6448, + "step": 8026 + }, + { + "epoch": 2.963204528673394, + "grad_norm": 0.8872064352035522, + "learning_rate": 2.5714873622376296e-06, + "loss": 0.6484, + "step": 8027 + }, + { + "epoch": 2.96357371400443, + "grad_norm": 0.9216285347938538, + "learning_rate": 2.5710011848913206e-06, + "loss": 0.5938, + "step": 8028 + }, + { + "epoch": 2.9639428993354664, + "grad_norm": 0.9098814129829407, + "learning_rate": 2.5705150048576382e-06, + "loss": 0.6299, + "step": 8029 + }, + { + "epoch": 2.9643120846665028, + "grad_norm": 0.916236400604248, + "learning_rate": 2.5700288221549855e-06, + "loss": 0.623, + "step": 8030 + }, + { + "epoch": 2.9646812699975387, + "grad_norm": 0.8870810866355896, + "learning_rate": 2.569542636801764e-06, + "loss": 0.6003, + "step": 8031 + }, + { + "epoch": 2.965050455328575, + "grad_norm": 0.8939295411109924, + "learning_rate": 2.569056448816376e-06, + "loss": 0.6067, + "step": 8032 + }, + { + "epoch": 2.9654196406596114, + "grad_norm": 0.8931148648262024, + "learning_rate": 2.568570258217222e-06, + "loss": 0.5526, + "step": 8033 + }, + { + "epoch": 2.9657888259906473, + "grad_norm": 0.8919257521629333, + "learning_rate": 2.5680840650227074e-06, + "loss": 0.6472, + "step": 8034 + }, + { + "epoch": 2.966158011321683, + "grad_norm": 0.9123396873474121, + "learning_rate": 2.5675978692512304e-06, + "loss": 0.6387, + "step": 8035 + }, + { + "epoch": 2.9665271966527196, + "grad_norm": 0.9230684041976929, + "learning_rate": 2.5671116709211963e-06, + "loss": 0.6326, + "step": 8036 + }, + { + "epoch": 2.966896381983756, + "grad_norm": 0.8538432121276855, + "learning_rate": 2.5666254700510066e-06, + "loss": 0.6209, + "step": 8037 + }, + { + "epoch": 2.967265567314792, + "grad_norm": 0.9165694713592529, + "learning_rate": 2.566139266659064e-06, + "loss": 0.6047, + "step": 8038 + }, + { + "epoch": 2.967634752645828, + "grad_norm": 0.8980461359024048, + "learning_rate": 2.5656530607637703e-06, + "loss": 0.5962, + "step": 8039 + }, + { + "epoch": 2.9680039379768646, + "grad_norm": 0.9002764225006104, + "learning_rate": 2.56516685238353e-06, + "loss": 0.6513, + "step": 8040 + }, + { + "epoch": 2.9683731233079005, + "grad_norm": 0.8860543966293335, + "learning_rate": 2.564680641536744e-06, + "loss": 0.5915, + "step": 8041 + }, + { + "epoch": 2.968742308638937, + "grad_norm": 0.914444625377655, + "learning_rate": 2.5641944282418174e-06, + "loss": 0.6261, + "step": 8042 + }, + { + "epoch": 2.969111493969973, + "grad_norm": 0.882707953453064, + "learning_rate": 2.5637082125171507e-06, + "loss": 0.5835, + "step": 8043 + }, + { + "epoch": 2.969480679301009, + "grad_norm": 0.8757317066192627, + "learning_rate": 2.5632219943811486e-06, + "loss": 0.6108, + "step": 8044 + }, + { + "epoch": 2.969849864632045, + "grad_norm": 0.9100270867347717, + "learning_rate": 2.562735773852214e-06, + "loss": 0.6404, + "step": 8045 + }, + { + "epoch": 2.9702190499630814, + "grad_norm": 0.9162290692329407, + "learning_rate": 2.5622495509487503e-06, + "loss": 0.6277, + "step": 8046 + }, + { + "epoch": 2.9705882352941178, + "grad_norm": 0.9124503135681152, + "learning_rate": 2.5617633256891604e-06, + "loss": 0.6236, + "step": 8047 + }, + { + "epoch": 2.9709574206251537, + "grad_norm": 0.9117345809936523, + "learning_rate": 2.561277098091849e-06, + "loss": 0.6111, + "step": 8048 + }, + { + "epoch": 2.97132660595619, + "grad_norm": 0.8786841630935669, + "learning_rate": 2.5607908681752177e-06, + "loss": 0.579, + "step": 8049 + }, + { + "epoch": 2.9716957912872264, + "grad_norm": 0.8999996781349182, + "learning_rate": 2.560304635957672e-06, + "loss": 0.6213, + "step": 8050 + }, + { + "epoch": 2.9720649766182623, + "grad_norm": 0.9027636051177979, + "learning_rate": 2.5598184014576144e-06, + "loss": 0.6353, + "step": 8051 + }, + { + "epoch": 2.9724341619492987, + "grad_norm": 0.9024510383605957, + "learning_rate": 2.5593321646934494e-06, + "loss": 0.6443, + "step": 8052 + }, + { + "epoch": 2.9728033472803346, + "grad_norm": 0.9000137448310852, + "learning_rate": 2.5588459256835806e-06, + "loss": 0.6093, + "step": 8053 + }, + { + "epoch": 2.973172532611371, + "grad_norm": 0.8742557764053345, + "learning_rate": 2.558359684446412e-06, + "loss": 0.6077, + "step": 8054 + }, + { + "epoch": 2.973541717942407, + "grad_norm": 0.8946438431739807, + "learning_rate": 2.5578734410003475e-06, + "loss": 0.6303, + "step": 8055 + }, + { + "epoch": 2.973910903273443, + "grad_norm": 0.9123812913894653, + "learning_rate": 2.5573871953637923e-06, + "loss": 0.6238, + "step": 8056 + }, + { + "epoch": 2.9742800886044796, + "grad_norm": 0.8820621967315674, + "learning_rate": 2.5569009475551494e-06, + "loss": 0.6014, + "step": 8057 + }, + { + "epoch": 2.9746492739355155, + "grad_norm": 0.9122143387794495, + "learning_rate": 2.556414697592823e-06, + "loss": 0.621, + "step": 8058 + }, + { + "epoch": 2.975018459266552, + "grad_norm": 0.8717246651649475, + "learning_rate": 2.555928445495219e-06, + "loss": 0.5869, + "step": 8059 + }, + { + "epoch": 2.975387644597588, + "grad_norm": 0.8876007795333862, + "learning_rate": 2.5554421912807414e-06, + "loss": 0.5998, + "step": 8060 + }, + { + "epoch": 2.975756829928624, + "grad_norm": 0.9008484482765198, + "learning_rate": 2.5549559349677933e-06, + "loss": 0.6212, + "step": 8061 + }, + { + "epoch": 2.9761260152596605, + "grad_norm": 0.8849285244941711, + "learning_rate": 2.5544696765747813e-06, + "loss": 0.5957, + "step": 8062 + }, + { + "epoch": 2.9764952005906964, + "grad_norm": 0.8932204246520996, + "learning_rate": 2.5539834161201084e-06, + "loss": 0.5733, + "step": 8063 + }, + { + "epoch": 2.9768643859217327, + "grad_norm": 0.926655650138855, + "learning_rate": 2.5534971536221804e-06, + "loss": 0.6178, + "step": 8064 + }, + { + "epoch": 2.9772335712527687, + "grad_norm": 0.9062601327896118, + "learning_rate": 2.5530108890994022e-06, + "loss": 0.6427, + "step": 8065 + }, + { + "epoch": 2.977602756583805, + "grad_norm": 0.8835027813911438, + "learning_rate": 2.552524622570179e-06, + "loss": 0.594, + "step": 8066 + }, + { + "epoch": 2.9779719419148414, + "grad_norm": 0.9006915092468262, + "learning_rate": 2.552038354052915e-06, + "loss": 0.6093, + "step": 8067 + }, + { + "epoch": 2.9783411272458773, + "grad_norm": 0.9037712216377258, + "learning_rate": 2.551552083566015e-06, + "loss": 0.6628, + "step": 8068 + }, + { + "epoch": 2.9787103125769137, + "grad_norm": 0.8693256974220276, + "learning_rate": 2.5510658111278857e-06, + "loss": 0.6405, + "step": 8069 + }, + { + "epoch": 2.97907949790795, + "grad_norm": 0.889022171497345, + "learning_rate": 2.550579536756932e-06, + "loss": 0.5984, + "step": 8070 + }, + { + "epoch": 2.979448683238986, + "grad_norm": 0.8871784210205078, + "learning_rate": 2.5500932604715577e-06, + "loss": 0.6112, + "step": 8071 + }, + { + "epoch": 2.9798178685700223, + "grad_norm": 1.015604019165039, + "learning_rate": 2.5496069822901704e-06, + "loss": 0.6213, + "step": 8072 + }, + { + "epoch": 2.980187053901058, + "grad_norm": 0.8940048813819885, + "learning_rate": 2.5491207022311737e-06, + "loss": 0.588, + "step": 8073 + }, + { + "epoch": 2.9805562392320946, + "grad_norm": 0.874444305896759, + "learning_rate": 2.5486344203129744e-06, + "loss": 0.6145, + "step": 8074 + }, + { + "epoch": 2.9809254245631305, + "grad_norm": 0.8978701829910278, + "learning_rate": 2.5481481365539773e-06, + "loss": 0.6074, + "step": 8075 + }, + { + "epoch": 2.981294609894167, + "grad_norm": 0.87534499168396, + "learning_rate": 2.5476618509725887e-06, + "loss": 0.6204, + "step": 8076 + }, + { + "epoch": 2.981663795225203, + "grad_norm": 0.9143137335777283, + "learning_rate": 2.547175563587213e-06, + "loss": 0.6261, + "step": 8077 + }, + { + "epoch": 2.982032980556239, + "grad_norm": 0.9373952150344849, + "learning_rate": 2.5466892744162586e-06, + "loss": 0.5984, + "step": 8078 + }, + { + "epoch": 2.9824021658872755, + "grad_norm": 0.8895273804664612, + "learning_rate": 2.546202983478129e-06, + "loss": 0.5943, + "step": 8079 + }, + { + "epoch": 2.982771351218312, + "grad_norm": 0.8986215591430664, + "learning_rate": 2.5457166907912323e-06, + "loss": 0.6219, + "step": 8080 + }, + { + "epoch": 2.9831405365493477, + "grad_norm": 0.8935422897338867, + "learning_rate": 2.545230396373972e-06, + "loss": 0.644, + "step": 8081 + }, + { + "epoch": 2.983509721880384, + "grad_norm": 0.9239478707313538, + "learning_rate": 2.5447441002447555e-06, + "loss": 0.6262, + "step": 8082 + }, + { + "epoch": 2.98387890721142, + "grad_norm": 0.9033856391906738, + "learning_rate": 2.5442578024219892e-06, + "loss": 0.6139, + "step": 8083 + }, + { + "epoch": 2.9842480925424564, + "grad_norm": 0.8906466960906982, + "learning_rate": 2.5437715029240793e-06, + "loss": 0.6386, + "step": 8084 + }, + { + "epoch": 2.9846172778734923, + "grad_norm": 0.8861475586891174, + "learning_rate": 2.5432852017694315e-06, + "loss": 0.6781, + "step": 8085 + }, + { + "epoch": 2.9849864632045287, + "grad_norm": 0.9099554419517517, + "learning_rate": 2.542798898976453e-06, + "loss": 0.6234, + "step": 8086 + }, + { + "epoch": 2.985355648535565, + "grad_norm": 0.9200599193572998, + "learning_rate": 2.542312594563549e-06, + "loss": 0.6207, + "step": 8087 + }, + { + "epoch": 2.985724833866601, + "grad_norm": 0.9025746583938599, + "learning_rate": 2.5418262885491272e-06, + "loss": 0.6173, + "step": 8088 + }, + { + "epoch": 2.9860940191976373, + "grad_norm": 0.8911523222923279, + "learning_rate": 2.541339980951593e-06, + "loss": 0.6051, + "step": 8089 + }, + { + "epoch": 2.9864632045286736, + "grad_norm": 0.9237908124923706, + "learning_rate": 2.5408536717893545e-06, + "loss": 0.6374, + "step": 8090 + }, + { + "epoch": 2.9868323898597096, + "grad_norm": 0.8824042081832886, + "learning_rate": 2.540367361080816e-06, + "loss": 0.6142, + "step": 8091 + }, + { + "epoch": 2.987201575190746, + "grad_norm": 0.9068201780319214, + "learning_rate": 2.539881048844387e-06, + "loss": 0.6063, + "step": 8092 + }, + { + "epoch": 2.987570760521782, + "grad_norm": 0.886593222618103, + "learning_rate": 2.539394735098472e-06, + "loss": 0.6116, + "step": 8093 + }, + { + "epoch": 2.987939945852818, + "grad_norm": 0.9119499921798706, + "learning_rate": 2.538908419861479e-06, + "loss": 0.628, + "step": 8094 + }, + { + "epoch": 2.988309131183854, + "grad_norm": 0.8999724984169006, + "learning_rate": 2.5384221031518145e-06, + "loss": 0.6081, + "step": 8095 + }, + { + "epoch": 2.9886783165148905, + "grad_norm": 0.8864060640335083, + "learning_rate": 2.537935784987886e-06, + "loss": 0.6229, + "step": 8096 + }, + { + "epoch": 2.989047501845927, + "grad_norm": 0.9082754850387573, + "learning_rate": 2.5374494653880993e-06, + "loss": 0.6066, + "step": 8097 + }, + { + "epoch": 2.9894166871769627, + "grad_norm": 0.8819117546081543, + "learning_rate": 2.536963144370863e-06, + "loss": 0.6057, + "step": 8098 + }, + { + "epoch": 2.989785872507999, + "grad_norm": 0.9041752219200134, + "learning_rate": 2.5364768219545827e-06, + "loss": 0.5945, + "step": 8099 + }, + { + "epoch": 2.9901550578390355, + "grad_norm": 0.8961989283561707, + "learning_rate": 2.5359904981576667e-06, + "loss": 0.6268, + "step": 8100 + }, + { + "epoch": 2.9905242431700714, + "grad_norm": 0.8928650617599487, + "learning_rate": 2.535504172998521e-06, + "loss": 0.6031, + "step": 8101 + }, + { + "epoch": 2.9908934285011073, + "grad_norm": 0.8879072666168213, + "learning_rate": 2.5350178464955537e-06, + "loss": 0.6045, + "step": 8102 + }, + { + "epoch": 2.9912626138321436, + "grad_norm": 0.897849440574646, + "learning_rate": 2.534531518667172e-06, + "loss": 0.6082, + "step": 8103 + }, + { + "epoch": 2.99163179916318, + "grad_norm": 0.9112852215766907, + "learning_rate": 2.534045189531784e-06, + "loss": 0.6349, + "step": 8104 + }, + { + "epoch": 2.992000984494216, + "grad_norm": 0.8612932562828064, + "learning_rate": 2.5335588591077952e-06, + "loss": 0.5522, + "step": 8105 + }, + { + "epoch": 2.9923701698252523, + "grad_norm": 0.8903751373291016, + "learning_rate": 2.5330725274136146e-06, + "loss": 0.5692, + "step": 8106 + }, + { + "epoch": 2.9927393551562886, + "grad_norm": 0.9291948080062866, + "learning_rate": 2.5325861944676492e-06, + "loss": 0.6041, + "step": 8107 + }, + { + "epoch": 2.9931085404873246, + "grad_norm": 0.854997456073761, + "learning_rate": 2.5320998602883073e-06, + "loss": 0.609, + "step": 8108 + }, + { + "epoch": 2.993477725818361, + "grad_norm": 0.9368383288383484, + "learning_rate": 2.5316135248939954e-06, + "loss": 0.6522, + "step": 8109 + }, + { + "epoch": 2.9938469111493973, + "grad_norm": 0.8973108530044556, + "learning_rate": 2.5311271883031215e-06, + "loss": 0.6364, + "step": 8110 + }, + { + "epoch": 2.994216096480433, + "grad_norm": 0.9252563118934631, + "learning_rate": 2.530640850534093e-06, + "loss": 0.6192, + "step": 8111 + }, + { + "epoch": 2.994585281811469, + "grad_norm": 0.9158481955528259, + "learning_rate": 2.5301545116053187e-06, + "loss": 0.6364, + "step": 8112 + }, + { + "epoch": 2.9949544671425055, + "grad_norm": 0.9081293344497681, + "learning_rate": 2.529668171535205e-06, + "loss": 0.6449, + "step": 8113 + }, + { + "epoch": 2.995323652473542, + "grad_norm": 0.9116506576538086, + "learning_rate": 2.5291818303421613e-06, + "loss": 0.6183, + "step": 8114 + }, + { + "epoch": 2.9956928378045777, + "grad_norm": 0.920960009098053, + "learning_rate": 2.528695488044593e-06, + "loss": 0.6051, + "step": 8115 + }, + { + "epoch": 2.996062023135614, + "grad_norm": 0.9272328019142151, + "learning_rate": 2.528209144660911e-06, + "loss": 0.6254, + "step": 8116 + }, + { + "epoch": 2.9964312084666505, + "grad_norm": 0.9068505764007568, + "learning_rate": 2.5277228002095205e-06, + "loss": 0.6455, + "step": 8117 + }, + { + "epoch": 2.9968003937976864, + "grad_norm": 0.922822117805481, + "learning_rate": 2.527236454708832e-06, + "loss": 0.6513, + "step": 8118 + }, + { + "epoch": 2.9971695791287227, + "grad_norm": 0.8910556435585022, + "learning_rate": 2.5267501081772518e-06, + "loss": 0.5811, + "step": 8119 + }, + { + "epoch": 2.9975387644597586, + "grad_norm": 0.8846920728683472, + "learning_rate": 2.526263760633188e-06, + "loss": 0.6059, + "step": 8120 + }, + { + "epoch": 2.997907949790795, + "grad_norm": 0.8970828056335449, + "learning_rate": 2.525777412095049e-06, + "loss": 0.6328, + "step": 8121 + }, + { + "epoch": 2.998277135121831, + "grad_norm": 0.874272882938385, + "learning_rate": 2.5252910625812443e-06, + "loss": 0.5984, + "step": 8122 + }, + { + "epoch": 2.9986463204528673, + "grad_norm": 0.8831865191459656, + "learning_rate": 2.52480471211018e-06, + "loss": 0.6182, + "step": 8123 + }, + { + "epoch": 2.9990155057839036, + "grad_norm": 0.8962333798408508, + "learning_rate": 2.5243183607002657e-06, + "loss": 0.59, + "step": 8124 + }, + { + "epoch": 2.9993846911149395, + "grad_norm": 0.8764786720275879, + "learning_rate": 2.523832008369908e-06, + "loss": 0.6257, + "step": 8125 + }, + { + "epoch": 2.999753876445976, + "grad_norm": 0.8796412944793701, + "learning_rate": 2.5233456551375172e-06, + "loss": 0.5808, + "step": 8126 + }, + { + "epoch": 3.0, + "grad_norm": 1.2153412103652954, + "learning_rate": 2.5228593010215007e-06, + "loss": 0.651, + "step": 8127 + }, + { + "epoch": 3.0003691853310364, + "grad_norm": 0.8833327889442444, + "learning_rate": 2.5223729460402668e-06, + "loss": 0.6005, + "step": 8128 + }, + { + "epoch": 3.0007383706620723, + "grad_norm": 0.871941864490509, + "learning_rate": 2.5218865902122243e-06, + "loss": 0.5574, + "step": 8129 + }, + { + "epoch": 3.0011075559931086, + "grad_norm": 0.8639472126960754, + "learning_rate": 2.5214002335557803e-06, + "loss": 0.5831, + "step": 8130 + }, + { + "epoch": 3.0014767413241445, + "grad_norm": 0.8731513023376465, + "learning_rate": 2.520913876089345e-06, + "loss": 0.5771, + "step": 8131 + }, + { + "epoch": 3.001845926655181, + "grad_norm": 0.8613303303718567, + "learning_rate": 2.520427517831326e-06, + "loss": 0.5668, + "step": 8132 + }, + { + "epoch": 3.0022151119862173, + "grad_norm": 0.8824806809425354, + "learning_rate": 2.5199411588001317e-06, + "loss": 0.5614, + "step": 8133 + }, + { + "epoch": 3.002584297317253, + "grad_norm": 0.8802688717842102, + "learning_rate": 2.5194547990141702e-06, + "loss": 0.5729, + "step": 8134 + }, + { + "epoch": 3.0029534826482895, + "grad_norm": 0.9029601216316223, + "learning_rate": 2.5189684384918517e-06, + "loss": 0.5543, + "step": 8135 + }, + { + "epoch": 3.0033226679793255, + "grad_norm": 0.8903936147689819, + "learning_rate": 2.518482077251584e-06, + "loss": 0.5814, + "step": 8136 + }, + { + "epoch": 3.003691853310362, + "grad_norm": 0.9269198775291443, + "learning_rate": 2.5179957153117747e-06, + "loss": 0.5808, + "step": 8137 + }, + { + "epoch": 3.004061038641398, + "grad_norm": 0.9027205109596252, + "learning_rate": 2.517509352690834e-06, + "loss": 0.5869, + "step": 8138 + }, + { + "epoch": 3.004430223972434, + "grad_norm": 0.92210453748703, + "learning_rate": 2.5170229894071692e-06, + "loss": 0.6018, + "step": 8139 + }, + { + "epoch": 3.0047994093034704, + "grad_norm": 0.9475352764129639, + "learning_rate": 2.5165366254791896e-06, + "loss": 0.6047, + "step": 8140 + }, + { + "epoch": 3.0051685946345064, + "grad_norm": 0.8668087124824524, + "learning_rate": 2.5160502609253042e-06, + "loss": 0.5668, + "step": 8141 + }, + { + "epoch": 3.0055377799655427, + "grad_norm": 0.9387706518173218, + "learning_rate": 2.5155638957639218e-06, + "loss": 0.6123, + "step": 8142 + }, + { + "epoch": 3.005906965296579, + "grad_norm": 0.9103645086288452, + "learning_rate": 2.51507753001345e-06, + "loss": 0.5728, + "step": 8143 + }, + { + "epoch": 3.006276150627615, + "grad_norm": 0.8924869894981384, + "learning_rate": 2.5145911636922994e-06, + "loss": 0.6063, + "step": 8144 + }, + { + "epoch": 3.0066453359586514, + "grad_norm": 0.9073446989059448, + "learning_rate": 2.514104796818877e-06, + "loss": 0.5893, + "step": 8145 + }, + { + "epoch": 3.0070145212896873, + "grad_norm": 0.9006859660148621, + "learning_rate": 2.5136184294115934e-06, + "loss": 0.5673, + "step": 8146 + }, + { + "epoch": 3.0073837066207236, + "grad_norm": 0.8980212211608887, + "learning_rate": 2.5131320614888556e-06, + "loss": 0.5667, + "step": 8147 + }, + { + "epoch": 3.00775289195176, + "grad_norm": 0.9022326469421387, + "learning_rate": 2.5126456930690734e-06, + "loss": 0.599, + "step": 8148 + }, + { + "epoch": 3.008122077282796, + "grad_norm": 0.9074740409851074, + "learning_rate": 2.512159324170657e-06, + "loss": 0.5893, + "step": 8149 + }, + { + "epoch": 3.0084912626138323, + "grad_norm": 0.9068003296852112, + "learning_rate": 2.5116729548120128e-06, + "loss": 0.6013, + "step": 8150 + }, + { + "epoch": 3.008860447944868, + "grad_norm": 0.8864825367927551, + "learning_rate": 2.511186585011551e-06, + "loss": 0.5983, + "step": 8151 + }, + { + "epoch": 3.0092296332759045, + "grad_norm": 0.9195462465286255, + "learning_rate": 2.5107002147876814e-06, + "loss": 0.5824, + "step": 8152 + }, + { + "epoch": 3.009598818606941, + "grad_norm": 0.9359033107757568, + "learning_rate": 2.5102138441588114e-06, + "loss": 0.5816, + "step": 8153 + }, + { + "epoch": 3.009968003937977, + "grad_norm": 0.9131808280944824, + "learning_rate": 2.509727473143351e-06, + "loss": 0.6196, + "step": 8154 + }, + { + "epoch": 3.010337189269013, + "grad_norm": 0.9148750901222229, + "learning_rate": 2.509241101759708e-06, + "loss": 0.5966, + "step": 8155 + }, + { + "epoch": 3.010706374600049, + "grad_norm": 0.9217899441719055, + "learning_rate": 2.508754730026293e-06, + "loss": 0.6061, + "step": 8156 + }, + { + "epoch": 3.0110755599310854, + "grad_norm": 0.9013656973838806, + "learning_rate": 2.5082683579615138e-06, + "loss": 0.5697, + "step": 8157 + }, + { + "epoch": 3.011444745262122, + "grad_norm": 0.9024122357368469, + "learning_rate": 2.50778198558378e-06, + "loss": 0.58, + "step": 8158 + }, + { + "epoch": 3.0118139305931577, + "grad_norm": 0.8878004550933838, + "learning_rate": 2.5072956129115005e-06, + "loss": 0.5822, + "step": 8159 + }, + { + "epoch": 3.012183115924194, + "grad_norm": 0.8956926465034485, + "learning_rate": 2.5068092399630845e-06, + "loss": 0.5694, + "step": 8160 + }, + { + "epoch": 3.01255230125523, + "grad_norm": 0.9186815619468689, + "learning_rate": 2.5063228667569406e-06, + "loss": 0.618, + "step": 8161 + }, + { + "epoch": 3.0129214865862664, + "grad_norm": 0.8784944415092468, + "learning_rate": 2.5058364933114785e-06, + "loss": 0.5324, + "step": 8162 + }, + { + "epoch": 3.0132906719173023, + "grad_norm": 0.9626219272613525, + "learning_rate": 2.505350119645107e-06, + "loss": 0.6245, + "step": 8163 + }, + { + "epoch": 3.0136598572483386, + "grad_norm": 0.8852795362472534, + "learning_rate": 2.504863745776235e-06, + "loss": 0.5522, + "step": 8164 + }, + { + "epoch": 3.014029042579375, + "grad_norm": 0.857429563999176, + "learning_rate": 2.504377371723271e-06, + "loss": 0.5855, + "step": 8165 + }, + { + "epoch": 3.014398227910411, + "grad_norm": 0.925311803817749, + "learning_rate": 2.503890997504626e-06, + "loss": 0.581, + "step": 8166 + }, + { + "epoch": 3.0147674132414473, + "grad_norm": 0.9056406021118164, + "learning_rate": 2.5034046231387075e-06, + "loss": 0.5824, + "step": 8167 + }, + { + "epoch": 3.015136598572483, + "grad_norm": 0.9163872599601746, + "learning_rate": 2.5029182486439254e-06, + "loss": 0.6219, + "step": 8168 + }, + { + "epoch": 3.0155057839035195, + "grad_norm": 0.9264765977859497, + "learning_rate": 2.5024318740386884e-06, + "loss": 0.569, + "step": 8169 + }, + { + "epoch": 3.015874969234556, + "grad_norm": 0.9219174385070801, + "learning_rate": 2.5019454993414056e-06, + "loss": 0.6276, + "step": 8170 + }, + { + "epoch": 3.016244154565592, + "grad_norm": 0.9214531183242798, + "learning_rate": 2.501459124570486e-06, + "loss": 0.6012, + "step": 8171 + }, + { + "epoch": 3.016613339896628, + "grad_norm": 0.8817593455314636, + "learning_rate": 2.500972749744339e-06, + "loss": 0.5612, + "step": 8172 + }, + { + "epoch": 3.016982525227664, + "grad_norm": 0.9294654130935669, + "learning_rate": 2.5004863748813746e-06, + "loss": 0.5936, + "step": 8173 + }, + { + "epoch": 3.0173517105587004, + "grad_norm": 0.8740285634994507, + "learning_rate": 2.5e-06, + "loss": 0.5557, + "step": 8174 + }, + { + "epoch": 3.017720895889737, + "grad_norm": 0.8840826153755188, + "learning_rate": 2.4995136251186263e-06, + "loss": 0.564, + "step": 8175 + }, + { + "epoch": 3.0180900812207727, + "grad_norm": 0.9071206450462341, + "learning_rate": 2.4990272502556613e-06, + "loss": 0.5314, + "step": 8176 + }, + { + "epoch": 3.018459266551809, + "grad_norm": 0.9179890155792236, + "learning_rate": 2.4985408754295148e-06, + "loss": 0.63, + "step": 8177 + }, + { + "epoch": 3.018828451882845, + "grad_norm": 0.9070969820022583, + "learning_rate": 2.4980545006585956e-06, + "loss": 0.6096, + "step": 8178 + }, + { + "epoch": 3.0191976372138813, + "grad_norm": 0.9110747575759888, + "learning_rate": 2.497568125961312e-06, + "loss": 0.5723, + "step": 8179 + }, + { + "epoch": 3.0195668225449177, + "grad_norm": 0.9273945093154907, + "learning_rate": 2.497081751356075e-06, + "loss": 0.5874, + "step": 8180 + }, + { + "epoch": 3.0199360078759536, + "grad_norm": 0.8837785124778748, + "learning_rate": 2.496595376861293e-06, + "loss": 0.5794, + "step": 8181 + }, + { + "epoch": 3.02030519320699, + "grad_norm": 0.9044152498245239, + "learning_rate": 2.4961090024953745e-06, + "loss": 0.6041, + "step": 8182 + }, + { + "epoch": 3.020674378538026, + "grad_norm": 0.9331149458885193, + "learning_rate": 2.4956226282767293e-06, + "loss": 0.6125, + "step": 8183 + }, + { + "epoch": 3.0210435638690623, + "grad_norm": 0.9185389876365662, + "learning_rate": 2.495136254223766e-06, + "loss": 0.6025, + "step": 8184 + }, + { + "epoch": 3.0214127492000986, + "grad_norm": 0.9036067128181458, + "learning_rate": 2.494649880354894e-06, + "loss": 0.5777, + "step": 8185 + }, + { + "epoch": 3.0217819345311345, + "grad_norm": 0.9138175249099731, + "learning_rate": 2.4941635066885223e-06, + "loss": 0.6021, + "step": 8186 + }, + { + "epoch": 3.022151119862171, + "grad_norm": 0.9143818616867065, + "learning_rate": 2.4936771332430602e-06, + "loss": 0.5959, + "step": 8187 + }, + { + "epoch": 3.022520305193207, + "grad_norm": 0.8910555839538574, + "learning_rate": 2.4931907600369167e-06, + "loss": 0.5741, + "step": 8188 + }, + { + "epoch": 3.022889490524243, + "grad_norm": 0.9076423048973083, + "learning_rate": 2.4927043870885e-06, + "loss": 0.5904, + "step": 8189 + }, + { + "epoch": 3.0232586758552795, + "grad_norm": 0.9219629168510437, + "learning_rate": 2.4922180144162205e-06, + "loss": 0.6282, + "step": 8190 + }, + { + "epoch": 3.0236278611863154, + "grad_norm": 0.9185525178909302, + "learning_rate": 2.4917316420384866e-06, + "loss": 0.5732, + "step": 8191 + }, + { + "epoch": 3.023997046517352, + "grad_norm": 1.0878840684890747, + "learning_rate": 2.4912452699737077e-06, + "loss": 0.5929, + "step": 8192 + }, + { + "epoch": 3.0243662318483877, + "grad_norm": 0.8963728547096252, + "learning_rate": 2.4907588982402924e-06, + "loss": 0.5962, + "step": 8193 + }, + { + "epoch": 3.024735417179424, + "grad_norm": 0.9216485023498535, + "learning_rate": 2.4902725268566503e-06, + "loss": 0.5912, + "step": 8194 + }, + { + "epoch": 3.0251046025104604, + "grad_norm": 0.8842558860778809, + "learning_rate": 2.4897861558411894e-06, + "loss": 0.5837, + "step": 8195 + }, + { + "epoch": 3.0254737878414963, + "grad_norm": 0.897105872631073, + "learning_rate": 2.48929978521232e-06, + "loss": 0.5707, + "step": 8196 + }, + { + "epoch": 3.0258429731725327, + "grad_norm": 0.8989336490631104, + "learning_rate": 2.4888134149884493e-06, + "loss": 0.5443, + "step": 8197 + }, + { + "epoch": 3.0262121585035686, + "grad_norm": 0.9094153046607971, + "learning_rate": 2.4883270451879872e-06, + "loss": 0.6421, + "step": 8198 + }, + { + "epoch": 3.026581343834605, + "grad_norm": 0.8738707900047302, + "learning_rate": 2.487840675829344e-06, + "loss": 0.5839, + "step": 8199 + }, + { + "epoch": 3.0269505291656413, + "grad_norm": 0.9411379098892212, + "learning_rate": 2.4873543069309262e-06, + "loss": 0.571, + "step": 8200 + }, + { + "epoch": 3.0273197144966772, + "grad_norm": 0.8870744109153748, + "learning_rate": 2.486867938511145e-06, + "loss": 0.5679, + "step": 8201 + }, + { + "epoch": 3.0276888998277136, + "grad_norm": 0.8897790908813477, + "learning_rate": 2.4863815705884074e-06, + "loss": 0.5707, + "step": 8202 + }, + { + "epoch": 3.0280580851587495, + "grad_norm": 0.9240819215774536, + "learning_rate": 2.485895203181123e-06, + "loss": 0.5974, + "step": 8203 + }, + { + "epoch": 3.028427270489786, + "grad_norm": 0.8983937501907349, + "learning_rate": 2.4854088363077015e-06, + "loss": 0.6147, + "step": 8204 + }, + { + "epoch": 3.0287964558208222, + "grad_norm": 1.0098345279693604, + "learning_rate": 2.4849224699865505e-06, + "loss": 0.6076, + "step": 8205 + }, + { + "epoch": 3.029165641151858, + "grad_norm": 0.8822774291038513, + "learning_rate": 2.4844361042360795e-06, + "loss": 0.5889, + "step": 8206 + }, + { + "epoch": 3.0295348264828945, + "grad_norm": 0.9166756272315979, + "learning_rate": 2.4839497390746966e-06, + "loss": 0.6163, + "step": 8207 + }, + { + "epoch": 3.0299040118139304, + "grad_norm": 0.9263394474983215, + "learning_rate": 2.4834633745208104e-06, + "loss": 0.587, + "step": 8208 + }, + { + "epoch": 3.030273197144967, + "grad_norm": 0.9352772831916809, + "learning_rate": 2.482977010592831e-06, + "loss": 0.5717, + "step": 8209 + }, + { + "epoch": 3.030642382476003, + "grad_norm": 0.8984179496765137, + "learning_rate": 2.4824906473091666e-06, + "loss": 0.6016, + "step": 8210 + }, + { + "epoch": 3.031011567807039, + "grad_norm": 0.8937541246414185, + "learning_rate": 2.4820042846882257e-06, + "loss": 0.5855, + "step": 8211 + }, + { + "epoch": 3.0313807531380754, + "grad_norm": 0.8934330940246582, + "learning_rate": 2.4815179227484166e-06, + "loss": 0.5896, + "step": 8212 + }, + { + "epoch": 3.0317499384691113, + "grad_norm": 0.9079824090003967, + "learning_rate": 2.4810315615081483e-06, + "loss": 0.6044, + "step": 8213 + }, + { + "epoch": 3.0321191238001477, + "grad_norm": 0.9116232991218567, + "learning_rate": 2.48054520098583e-06, + "loss": 0.5917, + "step": 8214 + }, + { + "epoch": 3.032488309131184, + "grad_norm": 0.9126923680305481, + "learning_rate": 2.480058841199869e-06, + "loss": 0.5722, + "step": 8215 + }, + { + "epoch": 3.03285749446222, + "grad_norm": 0.9014302492141724, + "learning_rate": 2.4795724821686755e-06, + "loss": 0.5889, + "step": 8216 + }, + { + "epoch": 3.0332266797932563, + "grad_norm": 0.9253442287445068, + "learning_rate": 2.479086123910656e-06, + "loss": 0.5783, + "step": 8217 + }, + { + "epoch": 3.0335958651242922, + "grad_norm": 0.9044568538665771, + "learning_rate": 2.4785997664442197e-06, + "loss": 0.5954, + "step": 8218 + }, + { + "epoch": 3.0339650504553286, + "grad_norm": 0.9206470847129822, + "learning_rate": 2.478113409787777e-06, + "loss": 0.5817, + "step": 8219 + }, + { + "epoch": 3.034334235786365, + "grad_norm": 0.9326675534248352, + "learning_rate": 2.4776270539597336e-06, + "loss": 0.5952, + "step": 8220 + }, + { + "epoch": 3.034703421117401, + "grad_norm": 0.9146982431411743, + "learning_rate": 2.4771406989785e-06, + "loss": 0.5844, + "step": 8221 + }, + { + "epoch": 3.0350726064484372, + "grad_norm": 0.8934233784675598, + "learning_rate": 2.476654344862483e-06, + "loss": 0.5609, + "step": 8222 + }, + { + "epoch": 3.035441791779473, + "grad_norm": 0.8880756497383118, + "learning_rate": 2.4761679916300923e-06, + "loss": 0.5749, + "step": 8223 + }, + { + "epoch": 3.0358109771105095, + "grad_norm": 0.8832208514213562, + "learning_rate": 2.4756816392997356e-06, + "loss": 0.5748, + "step": 8224 + }, + { + "epoch": 3.036180162441546, + "grad_norm": 0.8907672762870789, + "learning_rate": 2.475195287889821e-06, + "loss": 0.5666, + "step": 8225 + }, + { + "epoch": 3.036549347772582, + "grad_norm": 0.8861114978790283, + "learning_rate": 2.4747089374187573e-06, + "loss": 0.571, + "step": 8226 + }, + { + "epoch": 3.036918533103618, + "grad_norm": 0.9149507880210876, + "learning_rate": 2.474222587904951e-06, + "loss": 0.6073, + "step": 8227 + }, + { + "epoch": 3.037287718434654, + "grad_norm": 0.9191744327545166, + "learning_rate": 2.4737362393668123e-06, + "loss": 0.5849, + "step": 8228 + }, + { + "epoch": 3.0376569037656904, + "grad_norm": 0.9126807451248169, + "learning_rate": 2.4732498918227495e-06, + "loss": 0.5906, + "step": 8229 + }, + { + "epoch": 3.038026089096727, + "grad_norm": 0.9150454998016357, + "learning_rate": 2.472763545291169e-06, + "loss": 0.6213, + "step": 8230 + }, + { + "epoch": 3.0383952744277627, + "grad_norm": 0.94235759973526, + "learning_rate": 2.4722771997904795e-06, + "loss": 0.6057, + "step": 8231 + }, + { + "epoch": 3.038764459758799, + "grad_norm": 0.9222410321235657, + "learning_rate": 2.4717908553390903e-06, + "loss": 0.6032, + "step": 8232 + }, + { + "epoch": 3.039133645089835, + "grad_norm": 0.9039897918701172, + "learning_rate": 2.4713045119554073e-06, + "loss": 0.5891, + "step": 8233 + }, + { + "epoch": 3.0395028304208713, + "grad_norm": 0.8886401057243347, + "learning_rate": 2.4708181696578404e-06, + "loss": 0.5898, + "step": 8234 + }, + { + "epoch": 3.0398720157519072, + "grad_norm": 0.914966881275177, + "learning_rate": 2.4703318284647955e-06, + "loss": 0.6104, + "step": 8235 + }, + { + "epoch": 3.0402412010829436, + "grad_norm": 0.8726264238357544, + "learning_rate": 2.4698454883946817e-06, + "loss": 0.5635, + "step": 8236 + }, + { + "epoch": 3.04061038641398, + "grad_norm": 0.9776650071144104, + "learning_rate": 2.469359149465908e-06, + "loss": 0.5886, + "step": 8237 + }, + { + "epoch": 3.040979571745016, + "grad_norm": 0.9580798745155334, + "learning_rate": 2.468872811696879e-06, + "loss": 0.63, + "step": 8238 + }, + { + "epoch": 3.0413487570760522, + "grad_norm": 0.9198307991027832, + "learning_rate": 2.4683864751060054e-06, + "loss": 0.5942, + "step": 8239 + }, + { + "epoch": 3.041717942407088, + "grad_norm": 0.9147478938102722, + "learning_rate": 2.467900139711693e-06, + "loss": 0.5812, + "step": 8240 + }, + { + "epoch": 3.0420871277381245, + "grad_norm": 0.8855363726615906, + "learning_rate": 2.4674138055323508e-06, + "loss": 0.5612, + "step": 8241 + }, + { + "epoch": 3.042456313069161, + "grad_norm": 0.913031280040741, + "learning_rate": 2.466927472586386e-06, + "loss": 0.6067, + "step": 8242 + }, + { + "epoch": 3.042825498400197, + "grad_norm": 0.8771775364875793, + "learning_rate": 2.466441140892205e-06, + "loss": 0.5489, + "step": 8243 + }, + { + "epoch": 3.043194683731233, + "grad_norm": 0.949759840965271, + "learning_rate": 2.465954810468217e-06, + "loss": 0.5787, + "step": 8244 + }, + { + "epoch": 3.043563869062269, + "grad_norm": 0.8948526978492737, + "learning_rate": 2.4654684813328283e-06, + "loss": 0.6021, + "step": 8245 + }, + { + "epoch": 3.0439330543933054, + "grad_norm": 0.906459391117096, + "learning_rate": 2.4649821535044463e-06, + "loss": 0.6226, + "step": 8246 + }, + { + "epoch": 3.0443022397243418, + "grad_norm": 0.9635511040687561, + "learning_rate": 2.4644958270014803e-06, + "loss": 0.6556, + "step": 8247 + }, + { + "epoch": 3.0446714250553777, + "grad_norm": 0.9131077527999878, + "learning_rate": 2.464009501842334e-06, + "loss": 0.5727, + "step": 8248 + }, + { + "epoch": 3.045040610386414, + "grad_norm": 0.8821119070053101, + "learning_rate": 2.463523178045418e-06, + "loss": 0.5832, + "step": 8249 + }, + { + "epoch": 3.04540979571745, + "grad_norm": 0.9356818795204163, + "learning_rate": 2.463036855629138e-06, + "loss": 0.5918, + "step": 8250 + }, + { + "epoch": 3.0457789810484863, + "grad_norm": 0.8896338939666748, + "learning_rate": 2.4625505346119007e-06, + "loss": 0.5999, + "step": 8251 + }, + { + "epoch": 3.0461481663795227, + "grad_norm": 0.9070318341255188, + "learning_rate": 2.4620642150121147e-06, + "loss": 0.5845, + "step": 8252 + }, + { + "epoch": 3.0465173517105586, + "grad_norm": 0.9237996935844421, + "learning_rate": 2.461577896848186e-06, + "loss": 0.5871, + "step": 8253 + }, + { + "epoch": 3.046886537041595, + "grad_norm": 0.8963157534599304, + "learning_rate": 2.4610915801385216e-06, + "loss": 0.5559, + "step": 8254 + }, + { + "epoch": 3.047255722372631, + "grad_norm": 0.8834442496299744, + "learning_rate": 2.4606052649015288e-06, + "loss": 0.5832, + "step": 8255 + }, + { + "epoch": 3.0476249077036672, + "grad_norm": 0.9140040874481201, + "learning_rate": 2.4601189511556133e-06, + "loss": 0.5756, + "step": 8256 + }, + { + "epoch": 3.0479940930347036, + "grad_norm": 0.8993643522262573, + "learning_rate": 2.4596326389191846e-06, + "loss": 0.5991, + "step": 8257 + }, + { + "epoch": 3.0483632783657395, + "grad_norm": 0.9046819806098938, + "learning_rate": 2.4591463282106463e-06, + "loss": 0.6109, + "step": 8258 + }, + { + "epoch": 3.048732463696776, + "grad_norm": 0.9152814745903015, + "learning_rate": 2.4586600190484077e-06, + "loss": 0.5913, + "step": 8259 + }, + { + "epoch": 3.0491016490278118, + "grad_norm": 0.9029400944709778, + "learning_rate": 2.4581737114508736e-06, + "loss": 0.5841, + "step": 8260 + }, + { + "epoch": 3.049470834358848, + "grad_norm": 0.9163366556167603, + "learning_rate": 2.4576874054364512e-06, + "loss": 0.6227, + "step": 8261 + }, + { + "epoch": 3.0498400196898845, + "grad_norm": 0.921552836894989, + "learning_rate": 2.457201101023548e-06, + "loss": 0.5948, + "step": 8262 + }, + { + "epoch": 3.0502092050209204, + "grad_norm": 0.8904011249542236, + "learning_rate": 2.456714798230569e-06, + "loss": 0.5641, + "step": 8263 + }, + { + "epoch": 3.0505783903519568, + "grad_norm": 0.9280077219009399, + "learning_rate": 2.456228497075922e-06, + "loss": 0.6112, + "step": 8264 + }, + { + "epoch": 3.0509475756829927, + "grad_norm": 0.9131056070327759, + "learning_rate": 2.4557421975780116e-06, + "loss": 0.5724, + "step": 8265 + }, + { + "epoch": 3.051316761014029, + "grad_norm": 0.9220337867736816, + "learning_rate": 2.4552558997552445e-06, + "loss": 0.5959, + "step": 8266 + }, + { + "epoch": 3.0516859463450654, + "grad_norm": 0.9222990274429321, + "learning_rate": 2.4547696036260295e-06, + "loss": 0.5745, + "step": 8267 + }, + { + "epoch": 3.0520551316761013, + "grad_norm": 0.9223809838294983, + "learning_rate": 2.4542833092087685e-06, + "loss": 0.5847, + "step": 8268 + }, + { + "epoch": 3.0524243170071377, + "grad_norm": 0.9259125590324402, + "learning_rate": 2.453797016521871e-06, + "loss": 0.6006, + "step": 8269 + }, + { + "epoch": 3.0527935023381736, + "grad_norm": 0.9123812913894653, + "learning_rate": 2.453310725583742e-06, + "loss": 0.5748, + "step": 8270 + }, + { + "epoch": 3.05316268766921, + "grad_norm": 0.8965858817100525, + "learning_rate": 2.452824436412787e-06, + "loss": 0.5779, + "step": 8271 + }, + { + "epoch": 3.0535318730002463, + "grad_norm": 0.8952454924583435, + "learning_rate": 2.4523381490274125e-06, + "loss": 0.5878, + "step": 8272 + }, + { + "epoch": 3.0539010583312822, + "grad_norm": 0.8780558705329895, + "learning_rate": 2.4518518634460236e-06, + "loss": 0.5668, + "step": 8273 + }, + { + "epoch": 3.0542702436623186, + "grad_norm": 0.8927786946296692, + "learning_rate": 2.451365579687026e-06, + "loss": 0.5714, + "step": 8274 + }, + { + "epoch": 3.0546394289933545, + "grad_norm": 0.9407057762145996, + "learning_rate": 2.450879297768827e-06, + "loss": 0.6252, + "step": 8275 + }, + { + "epoch": 3.055008614324391, + "grad_norm": 0.935664713382721, + "learning_rate": 2.45039301770983e-06, + "loss": 0.5889, + "step": 8276 + }, + { + "epoch": 3.055377799655427, + "grad_norm": 0.9325358867645264, + "learning_rate": 2.449906739528443e-06, + "loss": 0.5948, + "step": 8277 + }, + { + "epoch": 3.055746984986463, + "grad_norm": 0.9058382511138916, + "learning_rate": 2.449420463243069e-06, + "loss": 0.582, + "step": 8278 + }, + { + "epoch": 3.0561161703174995, + "grad_norm": 0.9461725950241089, + "learning_rate": 2.4489341888721143e-06, + "loss": 0.6167, + "step": 8279 + }, + { + "epoch": 3.0564853556485354, + "grad_norm": 0.9225608706474304, + "learning_rate": 2.4484479164339857e-06, + "loss": 0.5725, + "step": 8280 + }, + { + "epoch": 3.0568545409795718, + "grad_norm": 0.9153626561164856, + "learning_rate": 2.447961645947086e-06, + "loss": 0.5899, + "step": 8281 + }, + { + "epoch": 3.057223726310608, + "grad_norm": 0.9231659770011902, + "learning_rate": 2.4474753774298224e-06, + "loss": 0.5929, + "step": 8282 + }, + { + "epoch": 3.057592911641644, + "grad_norm": 0.9297613501548767, + "learning_rate": 2.4469891109005986e-06, + "loss": 0.5929, + "step": 8283 + }, + { + "epoch": 3.0579620969726804, + "grad_norm": 0.8935356140136719, + "learning_rate": 2.4465028463778196e-06, + "loss": 0.5999, + "step": 8284 + }, + { + "epoch": 3.0583312823037163, + "grad_norm": 0.9124206304550171, + "learning_rate": 2.446016583879893e-06, + "loss": 0.5853, + "step": 8285 + }, + { + "epoch": 3.0587004676347527, + "grad_norm": 0.9093139171600342, + "learning_rate": 2.4455303234252196e-06, + "loss": 0.6077, + "step": 8286 + }, + { + "epoch": 3.059069652965789, + "grad_norm": 0.9029320478439331, + "learning_rate": 2.445044065032208e-06, + "loss": 0.5959, + "step": 8287 + }, + { + "epoch": 3.059438838296825, + "grad_norm": 0.9139606356620789, + "learning_rate": 2.4445578087192594e-06, + "loss": 0.6114, + "step": 8288 + }, + { + "epoch": 3.0598080236278613, + "grad_norm": 0.9167805314064026, + "learning_rate": 2.444071554504781e-06, + "loss": 0.6263, + "step": 8289 + }, + { + "epoch": 3.060177208958897, + "grad_norm": 0.9221027493476868, + "learning_rate": 2.4435853024071772e-06, + "loss": 0.5948, + "step": 8290 + }, + { + "epoch": 3.0605463942899336, + "grad_norm": 0.9238643050193787, + "learning_rate": 2.443099052444851e-06, + "loss": 0.5688, + "step": 8291 + }, + { + "epoch": 3.06091557962097, + "grad_norm": 0.8866807222366333, + "learning_rate": 2.442612804636209e-06, + "loss": 0.5669, + "step": 8292 + }, + { + "epoch": 3.061284764952006, + "grad_norm": 0.878009021282196, + "learning_rate": 2.442126558999653e-06, + "loss": 0.561, + "step": 8293 + }, + { + "epoch": 3.061653950283042, + "grad_norm": 0.9051724672317505, + "learning_rate": 2.441640315553588e-06, + "loss": 0.6067, + "step": 8294 + }, + { + "epoch": 3.062023135614078, + "grad_norm": 0.9139344692230225, + "learning_rate": 2.4411540743164207e-06, + "loss": 0.5879, + "step": 8295 + }, + { + "epoch": 3.0623923209451145, + "grad_norm": 0.9098366498947144, + "learning_rate": 2.440667835306551e-06, + "loss": 0.5847, + "step": 8296 + }, + { + "epoch": 3.062761506276151, + "grad_norm": 0.9177896976470947, + "learning_rate": 2.4401815985423873e-06, + "loss": 0.6342, + "step": 8297 + }, + { + "epoch": 3.0631306916071868, + "grad_norm": 0.9005759954452515, + "learning_rate": 2.4396953640423286e-06, + "loss": 0.5708, + "step": 8298 + }, + { + "epoch": 3.063499876938223, + "grad_norm": 0.9130109548568726, + "learning_rate": 2.4392091318247827e-06, + "loss": 0.601, + "step": 8299 + }, + { + "epoch": 3.063869062269259, + "grad_norm": 0.9311850070953369, + "learning_rate": 2.438722901908152e-06, + "loss": 0.598, + "step": 8300 + }, + { + "epoch": 3.0642382476002954, + "grad_norm": 0.9173158407211304, + "learning_rate": 2.43823667431084e-06, + "loss": 0.5869, + "step": 8301 + }, + { + "epoch": 3.0646074329313313, + "grad_norm": 0.9091874957084656, + "learning_rate": 2.437750449051251e-06, + "loss": 0.5909, + "step": 8302 + }, + { + "epoch": 3.0649766182623677, + "grad_norm": 0.9073666930198669, + "learning_rate": 2.437264226147787e-06, + "loss": 0.5995, + "step": 8303 + }, + { + "epoch": 3.065345803593404, + "grad_norm": 0.8933355212211609, + "learning_rate": 2.4367780056188514e-06, + "loss": 0.58, + "step": 8304 + }, + { + "epoch": 3.06571498892444, + "grad_norm": 0.8923709392547607, + "learning_rate": 2.4362917874828505e-06, + "loss": 0.5852, + "step": 8305 + }, + { + "epoch": 3.0660841742554763, + "grad_norm": 0.8672991991043091, + "learning_rate": 2.4358055717581835e-06, + "loss": 0.5679, + "step": 8306 + }, + { + "epoch": 3.066453359586512, + "grad_norm": 0.9042182564735413, + "learning_rate": 2.435319358463256e-06, + "loss": 0.5851, + "step": 8307 + }, + { + "epoch": 3.0668225449175486, + "grad_norm": 0.8981829285621643, + "learning_rate": 2.4348331476164706e-06, + "loss": 0.5691, + "step": 8308 + }, + { + "epoch": 3.067191730248585, + "grad_norm": 0.9206213355064392, + "learning_rate": 2.4343469392362297e-06, + "loss": 0.6133, + "step": 8309 + }, + { + "epoch": 3.067560915579621, + "grad_norm": 0.9243730306625366, + "learning_rate": 2.433860733340937e-06, + "loss": 0.5897, + "step": 8310 + }, + { + "epoch": 3.067930100910657, + "grad_norm": 0.9383952617645264, + "learning_rate": 2.4333745299489942e-06, + "loss": 0.6057, + "step": 8311 + }, + { + "epoch": 3.068299286241693, + "grad_norm": 0.9055639505386353, + "learning_rate": 2.4328883290788037e-06, + "loss": 0.5763, + "step": 8312 + }, + { + "epoch": 3.0686684715727295, + "grad_norm": 0.9097387790679932, + "learning_rate": 2.4324021307487704e-06, + "loss": 0.5869, + "step": 8313 + }, + { + "epoch": 3.069037656903766, + "grad_norm": 0.9037356972694397, + "learning_rate": 2.4319159349772934e-06, + "loss": 0.591, + "step": 8314 + }, + { + "epoch": 3.0694068422348018, + "grad_norm": 0.9203044176101685, + "learning_rate": 2.4314297417827786e-06, + "loss": 0.6193, + "step": 8315 + }, + { + "epoch": 3.069776027565838, + "grad_norm": 0.9074604511260986, + "learning_rate": 2.430943551183625e-06, + "loss": 0.5552, + "step": 8316 + }, + { + "epoch": 3.070145212896874, + "grad_norm": 0.9088731408119202, + "learning_rate": 2.4304573631982365e-06, + "loss": 0.5827, + "step": 8317 + }, + { + "epoch": 3.0705143982279104, + "grad_norm": 0.8938523530960083, + "learning_rate": 2.4299711778450153e-06, + "loss": 0.5772, + "step": 8318 + }, + { + "epoch": 3.0708835835589468, + "grad_norm": 0.9046451449394226, + "learning_rate": 2.4294849951423626e-06, + "loss": 0.599, + "step": 8319 + }, + { + "epoch": 3.0712527688899827, + "grad_norm": 0.9134471416473389, + "learning_rate": 2.4289988151086807e-06, + "loss": 0.5901, + "step": 8320 + }, + { + "epoch": 3.071621954221019, + "grad_norm": 0.9327409863471985, + "learning_rate": 2.428512637762371e-06, + "loss": 0.5942, + "step": 8321 + }, + { + "epoch": 3.071991139552055, + "grad_norm": 0.9063553214073181, + "learning_rate": 2.428026463121835e-06, + "loss": 0.5867, + "step": 8322 + }, + { + "epoch": 3.0723603248830913, + "grad_norm": 0.9690456986427307, + "learning_rate": 2.427540291205476e-06, + "loss": 0.6083, + "step": 8323 + }, + { + "epoch": 3.0727295102141277, + "grad_norm": 0.9011105895042419, + "learning_rate": 2.4270541220316924e-06, + "loss": 0.5735, + "step": 8324 + }, + { + "epoch": 3.0730986955451636, + "grad_norm": 0.9289100170135498, + "learning_rate": 2.4265679556188894e-06, + "loss": 0.564, + "step": 8325 + }, + { + "epoch": 3.0734678808762, + "grad_norm": 0.8956310153007507, + "learning_rate": 2.426081791985464e-06, + "loss": 0.5839, + "step": 8326 + }, + { + "epoch": 3.073837066207236, + "grad_norm": 0.8941949605941772, + "learning_rate": 2.42559563114982e-06, + "loss": 0.5746, + "step": 8327 + }, + { + "epoch": 3.074206251538272, + "grad_norm": 0.923061728477478, + "learning_rate": 2.4251094731303586e-06, + "loss": 0.5716, + "step": 8328 + }, + { + "epoch": 3.0745754368693086, + "grad_norm": 0.9209542870521545, + "learning_rate": 2.4246233179454796e-06, + "loss": 0.5764, + "step": 8329 + }, + { + "epoch": 3.0749446222003445, + "grad_norm": 0.8818815350532532, + "learning_rate": 2.4241371656135846e-06, + "loss": 0.5886, + "step": 8330 + }, + { + "epoch": 3.075313807531381, + "grad_norm": 0.9004179239273071, + "learning_rate": 2.423651016153073e-06, + "loss": 0.6035, + "step": 8331 + }, + { + "epoch": 3.0756829928624168, + "grad_norm": 0.8966555595397949, + "learning_rate": 2.4231648695823465e-06, + "loss": 0.5956, + "step": 8332 + }, + { + "epoch": 3.076052178193453, + "grad_norm": 0.9288646578788757, + "learning_rate": 2.4226787259198065e-06, + "loss": 0.634, + "step": 8333 + }, + { + "epoch": 3.0764213635244895, + "grad_norm": 0.9087578654289246, + "learning_rate": 2.4221925851838514e-06, + "loss": 0.562, + "step": 8334 + }, + { + "epoch": 3.0767905488555254, + "grad_norm": 0.9141826629638672, + "learning_rate": 2.4217064473928833e-06, + "loss": 0.576, + "step": 8335 + }, + { + "epoch": 3.0771597341865617, + "grad_norm": 0.9358482360839844, + "learning_rate": 2.4212203125653e-06, + "loss": 0.5989, + "step": 8336 + }, + { + "epoch": 3.0775289195175977, + "grad_norm": 0.9208990931510925, + "learning_rate": 2.4207341807195035e-06, + "loss": 0.5767, + "step": 8337 + }, + { + "epoch": 3.077898104848634, + "grad_norm": 0.9040857553482056, + "learning_rate": 2.420248051873894e-06, + "loss": 0.5527, + "step": 8338 + }, + { + "epoch": 3.0782672901796704, + "grad_norm": 0.8976907730102539, + "learning_rate": 2.4197619260468704e-06, + "loss": 0.5711, + "step": 8339 + }, + { + "epoch": 3.0786364755107063, + "grad_norm": 0.9195444583892822, + "learning_rate": 2.4192758032568325e-06, + "loss": 0.5824, + "step": 8340 + }, + { + "epoch": 3.0790056608417427, + "grad_norm": 0.895233690738678, + "learning_rate": 2.4187896835221793e-06, + "loss": 0.555, + "step": 8341 + }, + { + "epoch": 3.0793748461727786, + "grad_norm": 0.9136778116226196, + "learning_rate": 2.4183035668613108e-06, + "loss": 0.6184, + "step": 8342 + }, + { + "epoch": 3.079744031503815, + "grad_norm": 0.9530792832374573, + "learning_rate": 2.4178174532926283e-06, + "loss": 0.5913, + "step": 8343 + }, + { + "epoch": 3.0801132168348513, + "grad_norm": 0.9161463975906372, + "learning_rate": 2.417331342834527e-06, + "loss": 0.6049, + "step": 8344 + }, + { + "epoch": 3.080482402165887, + "grad_norm": 0.9100618958473206, + "learning_rate": 2.4168452355054095e-06, + "loss": 0.5925, + "step": 8345 + }, + { + "epoch": 3.0808515874969236, + "grad_norm": 0.9295870661735535, + "learning_rate": 2.4163591313236737e-06, + "loss": 0.5746, + "step": 8346 + }, + { + "epoch": 3.0812207728279595, + "grad_norm": 0.9195476770401001, + "learning_rate": 2.415873030307718e-06, + "loss": 0.5837, + "step": 8347 + }, + { + "epoch": 3.081589958158996, + "grad_norm": 1.0071697235107422, + "learning_rate": 2.415386932475942e-06, + "loss": 0.6168, + "step": 8348 + }, + { + "epoch": 3.081959143490032, + "grad_norm": 0.9120000600814819, + "learning_rate": 2.4149008378467436e-06, + "loss": 0.6012, + "step": 8349 + }, + { + "epoch": 3.082328328821068, + "grad_norm": 0.9439469575881958, + "learning_rate": 2.4144147464385213e-06, + "loss": 0.6069, + "step": 8350 + }, + { + "epoch": 3.0826975141521045, + "grad_norm": 0.9622184038162231, + "learning_rate": 2.413928658269675e-06, + "loss": 0.6259, + "step": 8351 + }, + { + "epoch": 3.0830666994831404, + "grad_norm": 0.9198629260063171, + "learning_rate": 2.4134425733585997e-06, + "loss": 0.5778, + "step": 8352 + }, + { + "epoch": 3.0834358848141767, + "grad_norm": 0.9197922945022583, + "learning_rate": 2.412956491723698e-06, + "loss": 0.6197, + "step": 8353 + }, + { + "epoch": 3.083805070145213, + "grad_norm": 0.9323838353157043, + "learning_rate": 2.412470413383364e-06, + "loss": 0.5771, + "step": 8354 + }, + { + "epoch": 3.084174255476249, + "grad_norm": 0.9143050909042358, + "learning_rate": 2.4119843383559975e-06, + "loss": 0.5964, + "step": 8355 + }, + { + "epoch": 3.0845434408072854, + "grad_norm": 0.9141031503677368, + "learning_rate": 2.4114982666599966e-06, + "loss": 0.5498, + "step": 8356 + }, + { + "epoch": 3.0849126261383213, + "grad_norm": 0.927799642086029, + "learning_rate": 2.411012198313758e-06, + "loss": 0.6258, + "step": 8357 + }, + { + "epoch": 3.0852818114693576, + "grad_norm": 0.9068622589111328, + "learning_rate": 2.4105261333356803e-06, + "loss": 0.5606, + "step": 8358 + }, + { + "epoch": 3.085650996800394, + "grad_norm": 0.9219293594360352, + "learning_rate": 2.4100400717441595e-06, + "loss": 0.6181, + "step": 8359 + }, + { + "epoch": 3.08602018213143, + "grad_norm": 0.8787194490432739, + "learning_rate": 2.4095540135575935e-06, + "loss": 0.5753, + "step": 8360 + }, + { + "epoch": 3.0863893674624663, + "grad_norm": 0.8950429558753967, + "learning_rate": 2.40906795879438e-06, + "loss": 0.57, + "step": 8361 + }, + { + "epoch": 3.086758552793502, + "grad_norm": 0.8923251628875732, + "learning_rate": 2.408581907472915e-06, + "loss": 0.6055, + "step": 8362 + }, + { + "epoch": 3.0871277381245386, + "grad_norm": 0.9400135278701782, + "learning_rate": 2.408095859611597e-06, + "loss": 0.5831, + "step": 8363 + }, + { + "epoch": 3.087496923455575, + "grad_norm": 0.9195470809936523, + "learning_rate": 2.4076098152288198e-06, + "loss": 0.5673, + "step": 8364 + }, + { + "epoch": 3.087866108786611, + "grad_norm": 0.9258822202682495, + "learning_rate": 2.407123774342983e-06, + "loss": 0.5812, + "step": 8365 + }, + { + "epoch": 3.088235294117647, + "grad_norm": 0.9185774922370911, + "learning_rate": 2.406637736972483e-06, + "loss": 0.596, + "step": 8366 + }, + { + "epoch": 3.088604479448683, + "grad_norm": 0.9008165597915649, + "learning_rate": 2.406151703135714e-06, + "loss": 0.5956, + "step": 8367 + }, + { + "epoch": 3.0889736647797195, + "grad_norm": 0.9388812184333801, + "learning_rate": 2.4056656728510742e-06, + "loss": 0.6116, + "step": 8368 + }, + { + "epoch": 3.0893428501107554, + "grad_norm": 0.9180893898010254, + "learning_rate": 2.405179646136958e-06, + "loss": 0.5696, + "step": 8369 + }, + { + "epoch": 3.0897120354417917, + "grad_norm": 0.9060349464416504, + "learning_rate": 2.4046936230117626e-06, + "loss": 0.5807, + "step": 8370 + }, + { + "epoch": 3.090081220772828, + "grad_norm": 0.9302439093589783, + "learning_rate": 2.404207603493884e-06, + "loss": 0.6102, + "step": 8371 + }, + { + "epoch": 3.090450406103864, + "grad_norm": 0.8982405662536621, + "learning_rate": 2.403721587601716e-06, + "loss": 0.589, + "step": 8372 + }, + { + "epoch": 3.0908195914349004, + "grad_norm": 0.9119776487350464, + "learning_rate": 2.4032355753536575e-06, + "loss": 0.5874, + "step": 8373 + }, + { + "epoch": 3.0911887767659363, + "grad_norm": 0.9152305722236633, + "learning_rate": 2.4027495667681004e-06, + "loss": 0.6256, + "step": 8374 + }, + { + "epoch": 3.0915579620969726, + "grad_norm": 0.9128805994987488, + "learning_rate": 2.402263561863442e-06, + "loss": 0.5946, + "step": 8375 + }, + { + "epoch": 3.091927147428009, + "grad_norm": 0.897714376449585, + "learning_rate": 2.401777560658077e-06, + "loss": 0.5804, + "step": 8376 + }, + { + "epoch": 3.092296332759045, + "grad_norm": 0.9007841348648071, + "learning_rate": 2.4012915631704004e-06, + "loss": 0.6215, + "step": 8377 + }, + { + "epoch": 3.0926655180900813, + "grad_norm": 0.9379267692565918, + "learning_rate": 2.4008055694188067e-06, + "loss": 0.6053, + "step": 8378 + }, + { + "epoch": 3.093034703421117, + "grad_norm": 0.9022829532623291, + "learning_rate": 2.4003195794216916e-06, + "loss": 0.608, + "step": 8379 + }, + { + "epoch": 3.0934038887521536, + "grad_norm": 0.9197754263877869, + "learning_rate": 2.399833593197448e-06, + "loss": 0.619, + "step": 8380 + }, + { + "epoch": 3.09377307408319, + "grad_norm": 0.9193922877311707, + "learning_rate": 2.3993476107644724e-06, + "loss": 0.6123, + "step": 8381 + }, + { + "epoch": 3.094142259414226, + "grad_norm": 0.8935986757278442, + "learning_rate": 2.3988616321411565e-06, + "loss": 0.6095, + "step": 8382 + }, + { + "epoch": 3.094511444745262, + "grad_norm": 0.9211336970329285, + "learning_rate": 2.398375657345897e-06, + "loss": 0.5705, + "step": 8383 + }, + { + "epoch": 3.094880630076298, + "grad_norm": 0.9254179000854492, + "learning_rate": 2.397889686397086e-06, + "loss": 0.6114, + "step": 8384 + }, + { + "epoch": 3.0952498154073345, + "grad_norm": 0.9569666385650635, + "learning_rate": 2.397403719313119e-06, + "loss": 0.5724, + "step": 8385 + }, + { + "epoch": 3.095619000738371, + "grad_norm": 0.9234176874160767, + "learning_rate": 2.396917756112389e-06, + "loss": 0.5765, + "step": 8386 + }, + { + "epoch": 3.0959881860694067, + "grad_norm": 0.9261723160743713, + "learning_rate": 2.3964317968132886e-06, + "loss": 0.5799, + "step": 8387 + }, + { + "epoch": 3.096357371400443, + "grad_norm": 0.912843644618988, + "learning_rate": 2.395945841434212e-06, + "loss": 0.5998, + "step": 8388 + }, + { + "epoch": 3.096726556731479, + "grad_norm": 0.8938929438591003, + "learning_rate": 2.3954598899935534e-06, + "loss": 0.5741, + "step": 8389 + }, + { + "epoch": 3.0970957420625154, + "grad_norm": 0.9112582802772522, + "learning_rate": 2.394973942509704e-06, + "loss": 0.6047, + "step": 8390 + }, + { + "epoch": 3.0974649273935517, + "grad_norm": 0.9069494605064392, + "learning_rate": 2.3944879990010583e-06, + "loss": 0.5828, + "step": 8391 + }, + { + "epoch": 3.0978341127245876, + "grad_norm": 0.9416301846504211, + "learning_rate": 2.3940020594860074e-06, + "loss": 0.6025, + "step": 8392 + }, + { + "epoch": 3.098203298055624, + "grad_norm": 0.9294349551200867, + "learning_rate": 2.393516123982946e-06, + "loss": 0.5995, + "step": 8393 + }, + { + "epoch": 3.09857248338666, + "grad_norm": 0.8687685132026672, + "learning_rate": 2.3930301925102662e-06, + "loss": 0.5529, + "step": 8394 + }, + { + "epoch": 3.0989416687176963, + "grad_norm": 0.899652898311615, + "learning_rate": 2.392544265086359e-06, + "loss": 0.6099, + "step": 8395 + }, + { + "epoch": 3.0993108540487326, + "grad_norm": 0.9761612415313721, + "learning_rate": 2.392058341729618e-06, + "loss": 0.5747, + "step": 8396 + }, + { + "epoch": 3.0996800393797685, + "grad_norm": 0.9386091232299805, + "learning_rate": 2.3915724224584344e-06, + "loss": 0.6304, + "step": 8397 + }, + { + "epoch": 3.100049224710805, + "grad_norm": 0.921320378780365, + "learning_rate": 2.3910865072912003e-06, + "loss": 0.5771, + "step": 8398 + }, + { + "epoch": 3.100418410041841, + "grad_norm": 0.9103705286979675, + "learning_rate": 2.3906005962463084e-06, + "loss": 0.5957, + "step": 8399 + }, + { + "epoch": 3.100787595372877, + "grad_norm": 0.9360268115997314, + "learning_rate": 2.390114689342148e-06, + "loss": 0.5949, + "step": 8400 + }, + { + "epoch": 3.1011567807039135, + "grad_norm": 0.9109243750572205, + "learning_rate": 2.3896287865971133e-06, + "loss": 0.5908, + "step": 8401 + }, + { + "epoch": 3.1015259660349495, + "grad_norm": 0.9429595470428467, + "learning_rate": 2.3891428880295926e-06, + "loss": 0.5993, + "step": 8402 + }, + { + "epoch": 3.101895151365986, + "grad_norm": 0.9062892198562622, + "learning_rate": 2.3886569936579797e-06, + "loss": 0.5636, + "step": 8403 + }, + { + "epoch": 3.1022643366970217, + "grad_norm": 0.9100238680839539, + "learning_rate": 2.388171103500665e-06, + "loss": 0.57, + "step": 8404 + }, + { + "epoch": 3.102633522028058, + "grad_norm": 0.907963752746582, + "learning_rate": 2.387685217576037e-06, + "loss": 0.6299, + "step": 8405 + }, + { + "epoch": 3.1030027073590944, + "grad_norm": 0.930798351764679, + "learning_rate": 2.38719933590249e-06, + "loss": 0.5876, + "step": 8406 + }, + { + "epoch": 3.1033718926901304, + "grad_norm": 0.9438997507095337, + "learning_rate": 2.3867134584984114e-06, + "loss": 0.6228, + "step": 8407 + }, + { + "epoch": 3.1037410780211667, + "grad_norm": 0.8991132378578186, + "learning_rate": 2.3862275853821924e-06, + "loss": 0.5833, + "step": 8408 + }, + { + "epoch": 3.1041102633522026, + "grad_norm": 0.9155349135398865, + "learning_rate": 2.3857417165722245e-06, + "loss": 0.6003, + "step": 8409 + }, + { + "epoch": 3.104479448683239, + "grad_norm": 0.8807457685470581, + "learning_rate": 2.385255852086895e-06, + "loss": 0.5825, + "step": 8410 + }, + { + "epoch": 3.1048486340142754, + "grad_norm": 0.8682147264480591, + "learning_rate": 2.3847699919445973e-06, + "loss": 0.5767, + "step": 8411 + }, + { + "epoch": 3.1052178193453113, + "grad_norm": 0.9035055637359619, + "learning_rate": 2.3842841361637173e-06, + "loss": 0.5821, + "step": 8412 + }, + { + "epoch": 3.1055870046763476, + "grad_norm": 0.8847358822822571, + "learning_rate": 2.383798284762647e-06, + "loss": 0.5947, + "step": 8413 + }, + { + "epoch": 3.1059561900073835, + "grad_norm": 0.9107963442802429, + "learning_rate": 2.383312437759775e-06, + "loss": 0.6012, + "step": 8414 + }, + { + "epoch": 3.10632537533842, + "grad_norm": 0.9082841277122498, + "learning_rate": 2.38282659517349e-06, + "loss": 0.5752, + "step": 8415 + }, + { + "epoch": 3.1066945606694563, + "grad_norm": 0.9268093705177307, + "learning_rate": 2.3823407570221812e-06, + "loss": 0.596, + "step": 8416 + }, + { + "epoch": 3.107063746000492, + "grad_norm": 0.9144713878631592, + "learning_rate": 2.3818549233242384e-06, + "loss": 0.6049, + "step": 8417 + }, + { + "epoch": 3.1074329313315285, + "grad_norm": 0.9330682158470154, + "learning_rate": 2.3813690940980492e-06, + "loss": 0.5632, + "step": 8418 + }, + { + "epoch": 3.1078021166625645, + "grad_norm": 0.9222251772880554, + "learning_rate": 2.3808832693620025e-06, + "loss": 0.6291, + "step": 8419 + }, + { + "epoch": 3.108171301993601, + "grad_norm": 0.909504234790802, + "learning_rate": 2.3803974491344856e-06, + "loss": 0.602, + "step": 8420 + }, + { + "epoch": 3.108540487324637, + "grad_norm": 0.9405035376548767, + "learning_rate": 2.379911633433888e-06, + "loss": 0.6042, + "step": 8421 + }, + { + "epoch": 3.108909672655673, + "grad_norm": 0.949799656867981, + "learning_rate": 2.3794258222785975e-06, + "loss": 0.568, + "step": 8422 + }, + { + "epoch": 3.1092788579867094, + "grad_norm": 0.9112414717674255, + "learning_rate": 2.378940015687001e-06, + "loss": 0.6118, + "step": 8423 + }, + { + "epoch": 3.1096480433177454, + "grad_norm": 0.8707449436187744, + "learning_rate": 2.3784542136774874e-06, + "loss": 0.5444, + "step": 8424 + }, + { + "epoch": 3.1100172286487817, + "grad_norm": 0.8872550129890442, + "learning_rate": 2.377968416268443e-06, + "loss": 0.5968, + "step": 8425 + }, + { + "epoch": 3.110386413979818, + "grad_norm": 0.9240366816520691, + "learning_rate": 2.3774826234782554e-06, + "loss": 0.5999, + "step": 8426 + }, + { + "epoch": 3.110755599310854, + "grad_norm": 0.8928487300872803, + "learning_rate": 2.3769968353253127e-06, + "loss": 0.5645, + "step": 8427 + }, + { + "epoch": 3.1111247846418904, + "grad_norm": 0.9100751876831055, + "learning_rate": 2.376511051828e-06, + "loss": 0.5816, + "step": 8428 + }, + { + "epoch": 3.1114939699729263, + "grad_norm": 0.9335805177688599, + "learning_rate": 2.3760252730047056e-06, + "loss": 0.6073, + "step": 8429 + }, + { + "epoch": 3.1118631553039626, + "grad_norm": 0.8811178803443909, + "learning_rate": 2.3755394988738143e-06, + "loss": 0.5334, + "step": 8430 + }, + { + "epoch": 3.112232340634999, + "grad_norm": 0.9062147736549377, + "learning_rate": 2.375053729453715e-06, + "loss": 0.595, + "step": 8431 + }, + { + "epoch": 3.112601525966035, + "grad_norm": 0.9002568125724792, + "learning_rate": 2.374567964762792e-06, + "loss": 0.5845, + "step": 8432 + }, + { + "epoch": 3.1129707112970713, + "grad_norm": 0.9122824668884277, + "learning_rate": 2.3740822048194325e-06, + "loss": 0.6018, + "step": 8433 + }, + { + "epoch": 3.113339896628107, + "grad_norm": 0.8831886053085327, + "learning_rate": 2.3735964496420215e-06, + "loss": 0.5618, + "step": 8434 + }, + { + "epoch": 3.1137090819591435, + "grad_norm": 0.9140695929527283, + "learning_rate": 2.373110699248945e-06, + "loss": 0.5982, + "step": 8435 + }, + { + "epoch": 3.1140782672901794, + "grad_norm": 0.9007919430732727, + "learning_rate": 2.372624953658588e-06, + "loss": 0.5886, + "step": 8436 + }, + { + "epoch": 3.114447452621216, + "grad_norm": 0.9184356927871704, + "learning_rate": 2.3721392128893376e-06, + "loss": 0.5898, + "step": 8437 + }, + { + "epoch": 3.114816637952252, + "grad_norm": 0.8915714621543884, + "learning_rate": 2.3716534769595764e-06, + "loss": 0.6011, + "step": 8438 + }, + { + "epoch": 3.115185823283288, + "grad_norm": 0.88692307472229, + "learning_rate": 2.3711677458876914e-06, + "loss": 0.5715, + "step": 8439 + }, + { + "epoch": 3.1155550086143244, + "grad_norm": 0.9344249963760376, + "learning_rate": 2.3706820196920654e-06, + "loss": 0.5957, + "step": 8440 + }, + { + "epoch": 3.115924193945361, + "grad_norm": 0.9012224674224854, + "learning_rate": 2.370196298391085e-06, + "loss": 0.6096, + "step": 8441 + }, + { + "epoch": 3.1162933792763967, + "grad_norm": 0.8949618935585022, + "learning_rate": 2.3697105820031336e-06, + "loss": 0.6171, + "step": 8442 + }, + { + "epoch": 3.116662564607433, + "grad_norm": 0.9109833240509033, + "learning_rate": 2.3692248705465958e-06, + "loss": 0.6122, + "step": 8443 + }, + { + "epoch": 3.117031749938469, + "grad_norm": 0.9324072599411011, + "learning_rate": 2.368739164039855e-06, + "loss": 0.5857, + "step": 8444 + }, + { + "epoch": 3.1174009352695053, + "grad_norm": 0.9023732542991638, + "learning_rate": 2.3682534625012955e-06, + "loss": 0.5487, + "step": 8445 + }, + { + "epoch": 3.1177701206005413, + "grad_norm": 0.8767456412315369, + "learning_rate": 2.367767765949301e-06, + "loss": 0.5652, + "step": 8446 + }, + { + "epoch": 3.1181393059315776, + "grad_norm": 0.9188612699508667, + "learning_rate": 2.3672820744022553e-06, + "loss": 0.5414, + "step": 8447 + }, + { + "epoch": 3.118508491262614, + "grad_norm": 0.8611406683921814, + "learning_rate": 2.36679638787854e-06, + "loss": 0.5453, + "step": 8448 + }, + { + "epoch": 3.11887767659365, + "grad_norm": 0.8994858860969543, + "learning_rate": 2.366310706396541e-06, + "loss": 0.5585, + "step": 8449 + }, + { + "epoch": 3.1192468619246863, + "grad_norm": 0.8991490602493286, + "learning_rate": 2.3658250299746375e-06, + "loss": 0.5751, + "step": 8450 + }, + { + "epoch": 3.119616047255722, + "grad_norm": 0.8986338973045349, + "learning_rate": 2.365339358631215e-06, + "loss": 0.5788, + "step": 8451 + }, + { + "epoch": 3.1199852325867585, + "grad_norm": 0.9019541144371033, + "learning_rate": 2.3648536923846565e-06, + "loss": 0.6117, + "step": 8452 + }, + { + "epoch": 3.120354417917795, + "grad_norm": 0.9477744102478027, + "learning_rate": 2.364368031253342e-06, + "loss": 0.5632, + "step": 8453 + }, + { + "epoch": 3.120723603248831, + "grad_norm": 0.9126050472259521, + "learning_rate": 2.363882375255655e-06, + "loss": 0.5857, + "step": 8454 + }, + { + "epoch": 3.121092788579867, + "grad_norm": 0.9294050335884094, + "learning_rate": 2.363396724409977e-06, + "loss": 0.5949, + "step": 8455 + }, + { + "epoch": 3.121461973910903, + "grad_norm": 0.9364838004112244, + "learning_rate": 2.3629110787346897e-06, + "loss": 0.5716, + "step": 8456 + }, + { + "epoch": 3.1218311592419394, + "grad_norm": 0.8912398815155029, + "learning_rate": 2.3624254382481758e-06, + "loss": 0.5783, + "step": 8457 + }, + { + "epoch": 3.122200344572976, + "grad_norm": 0.9129021167755127, + "learning_rate": 2.3619398029688146e-06, + "loss": 0.6115, + "step": 8458 + }, + { + "epoch": 3.1225695299040117, + "grad_norm": 0.937690019607544, + "learning_rate": 2.361454172914988e-06, + "loss": 0.5993, + "step": 8459 + }, + { + "epoch": 3.122938715235048, + "grad_norm": 0.90199214220047, + "learning_rate": 2.360968548105078e-06, + "loss": 0.5911, + "step": 8460 + }, + { + "epoch": 3.123307900566084, + "grad_norm": 0.9203724265098572, + "learning_rate": 2.360482928557464e-06, + "loss": 0.5992, + "step": 8461 + }, + { + "epoch": 3.1236770858971203, + "grad_norm": 0.9192022085189819, + "learning_rate": 2.359997314290528e-06, + "loss": 0.6007, + "step": 8462 + }, + { + "epoch": 3.1240462712281567, + "grad_norm": 0.9111342430114746, + "learning_rate": 2.359511705322649e-06, + "loss": 0.5787, + "step": 8463 + }, + { + "epoch": 3.1244154565591926, + "grad_norm": 0.900641143321991, + "learning_rate": 2.3590261016722074e-06, + "loss": 0.567, + "step": 8464 + }, + { + "epoch": 3.124784641890229, + "grad_norm": 0.9171278476715088, + "learning_rate": 2.358540503357584e-06, + "loss": 0.6079, + "step": 8465 + }, + { + "epoch": 3.125153827221265, + "grad_norm": 0.9205021262168884, + "learning_rate": 2.3580549103971573e-06, + "loss": 0.5871, + "step": 8466 + }, + { + "epoch": 3.1255230125523012, + "grad_norm": 0.9033164978027344, + "learning_rate": 2.357569322809308e-06, + "loss": 0.6055, + "step": 8467 + }, + { + "epoch": 3.1258921978833376, + "grad_norm": 0.9075078368186951, + "learning_rate": 2.3570837406124143e-06, + "loss": 0.6152, + "step": 8468 + }, + { + "epoch": 3.1262613832143735, + "grad_norm": 0.8858950734138489, + "learning_rate": 2.3565981638248553e-06, + "loss": 0.5637, + "step": 8469 + }, + { + "epoch": 3.12663056854541, + "grad_norm": 0.9190322160720825, + "learning_rate": 2.3561125924650115e-06, + "loss": 0.5751, + "step": 8470 + }, + { + "epoch": 3.126999753876446, + "grad_norm": 0.89570552110672, + "learning_rate": 2.3556270265512604e-06, + "loss": 0.6218, + "step": 8471 + }, + { + "epoch": 3.127368939207482, + "grad_norm": 0.932904064655304, + "learning_rate": 2.3551414661019815e-06, + "loss": 0.5931, + "step": 8472 + }, + { + "epoch": 3.1277381245385185, + "grad_norm": 0.9185943007469177, + "learning_rate": 2.3546559111355515e-06, + "loss": 0.6111, + "step": 8473 + }, + { + "epoch": 3.1281073098695544, + "grad_norm": 0.906268298625946, + "learning_rate": 2.3541703616703497e-06, + "loss": 0.5788, + "step": 8474 + }, + { + "epoch": 3.128476495200591, + "grad_norm": 0.9097051024436951, + "learning_rate": 2.353684817724754e-06, + "loss": 0.5784, + "step": 8475 + }, + { + "epoch": 3.1288456805316267, + "grad_norm": 0.9430055022239685, + "learning_rate": 2.353199279317141e-06, + "loss": 0.5887, + "step": 8476 + }, + { + "epoch": 3.129214865862663, + "grad_norm": 0.9396917223930359, + "learning_rate": 2.3527137464658898e-06, + "loss": 0.6027, + "step": 8477 + }, + { + "epoch": 3.1295840511936994, + "grad_norm": 0.9339459538459778, + "learning_rate": 2.352228219189376e-06, + "loss": 0.6236, + "step": 8478 + }, + { + "epoch": 3.1299532365247353, + "grad_norm": 0.9216033816337585, + "learning_rate": 2.3517426975059773e-06, + "loss": 0.5997, + "step": 8479 + }, + { + "epoch": 3.1303224218557717, + "grad_norm": 0.926617443561554, + "learning_rate": 2.3512571814340716e-06, + "loss": 0.62, + "step": 8480 + }, + { + "epoch": 3.1306916071868076, + "grad_norm": 0.9233250617980957, + "learning_rate": 2.3507716709920343e-06, + "loss": 0.6087, + "step": 8481 + }, + { + "epoch": 3.131060792517844, + "grad_norm": 0.9315186738967896, + "learning_rate": 2.3502861661982423e-06, + "loss": 0.6081, + "step": 8482 + }, + { + "epoch": 3.1314299778488803, + "grad_norm": 0.9348911046981812, + "learning_rate": 2.3498006670710715e-06, + "loss": 0.5956, + "step": 8483 + }, + { + "epoch": 3.1317991631799162, + "grad_norm": 0.936145544052124, + "learning_rate": 2.349315173628898e-06, + "loss": 0.6066, + "step": 8484 + }, + { + "epoch": 3.1321683485109526, + "grad_norm": 0.9430466294288635, + "learning_rate": 2.348829685890098e-06, + "loss": 0.5984, + "step": 8485 + }, + { + "epoch": 3.1325375338419885, + "grad_norm": 0.92029869556427, + "learning_rate": 2.348344203873046e-06, + "loss": 0.5845, + "step": 8486 + }, + { + "epoch": 3.132906719173025, + "grad_norm": 0.9373661875724792, + "learning_rate": 2.347858727596119e-06, + "loss": 0.5902, + "step": 8487 + }, + { + "epoch": 3.1332759045040612, + "grad_norm": 0.9223378896713257, + "learning_rate": 2.3473732570776903e-06, + "loss": 0.5988, + "step": 8488 + }, + { + "epoch": 3.133645089835097, + "grad_norm": 0.9272832274436951, + "learning_rate": 2.346887792336135e-06, + "loss": 0.5993, + "step": 8489 + }, + { + "epoch": 3.1340142751661335, + "grad_norm": 0.8868125081062317, + "learning_rate": 2.34640233338983e-06, + "loss": 0.5715, + "step": 8490 + }, + { + "epoch": 3.1343834604971694, + "grad_norm": 0.9223474264144897, + "learning_rate": 2.345916880257147e-06, + "loss": 0.5907, + "step": 8491 + }, + { + "epoch": 3.134752645828206, + "grad_norm": 0.9506804347038269, + "learning_rate": 2.3454314329564617e-06, + "loss": 0.6102, + "step": 8492 + }, + { + "epoch": 3.135121831159242, + "grad_norm": 0.950593888759613, + "learning_rate": 2.3449459915061486e-06, + "loss": 0.5724, + "step": 8493 + }, + { + "epoch": 3.135491016490278, + "grad_norm": 0.9366334676742554, + "learning_rate": 2.3444605559245804e-06, + "loss": 0.6017, + "step": 8494 + }, + { + "epoch": 3.1358602018213144, + "grad_norm": 0.9089887142181396, + "learning_rate": 2.3439751262301313e-06, + "loss": 0.5903, + "step": 8495 + }, + { + "epoch": 3.1362293871523503, + "grad_norm": 0.9075221419334412, + "learning_rate": 2.343489702441174e-06, + "loss": 0.5754, + "step": 8496 + }, + { + "epoch": 3.1365985724833867, + "grad_norm": 0.917420506477356, + "learning_rate": 2.3430042845760812e-06, + "loss": 0.5829, + "step": 8497 + }, + { + "epoch": 3.136967757814423, + "grad_norm": 0.9421452879905701, + "learning_rate": 2.3425188726532287e-06, + "loss": 0.5857, + "step": 8498 + }, + { + "epoch": 3.137336943145459, + "grad_norm": 0.9181631803512573, + "learning_rate": 2.3420334666909856e-06, + "loss": 0.6307, + "step": 8499 + }, + { + "epoch": 3.1377061284764953, + "grad_norm": 0.9021543860435486, + "learning_rate": 2.3415480667077265e-06, + "loss": 0.5904, + "step": 8500 + }, + { + "epoch": 3.1380753138075312, + "grad_norm": 0.9291503429412842, + "learning_rate": 2.341062672721823e-06, + "loss": 0.57, + "step": 8501 + }, + { + "epoch": 3.1384444991385676, + "grad_norm": 0.9328893423080444, + "learning_rate": 2.340577284751647e-06, + "loss": 0.598, + "step": 8502 + }, + { + "epoch": 3.1388136844696035, + "grad_norm": 0.9171802997589111, + "learning_rate": 2.340091902815571e-06, + "loss": 0.6083, + "step": 8503 + }, + { + "epoch": 3.13918286980064, + "grad_norm": 0.9249417185783386, + "learning_rate": 2.3396065269319655e-06, + "loss": 0.5992, + "step": 8504 + }, + { + "epoch": 3.1395520551316762, + "grad_norm": 0.9455564022064209, + "learning_rate": 2.339121157119203e-06, + "loss": 0.5731, + "step": 8505 + }, + { + "epoch": 3.139921240462712, + "grad_norm": 0.9165706038475037, + "learning_rate": 2.3386357933956534e-06, + "loss": 0.6086, + "step": 8506 + }, + { + "epoch": 3.1402904257937485, + "grad_norm": 0.8894111514091492, + "learning_rate": 2.3381504357796877e-06, + "loss": 0.5795, + "step": 8507 + }, + { + "epoch": 3.140659611124785, + "grad_norm": 0.8816021680831909, + "learning_rate": 2.337665084289678e-06, + "loss": 0.5603, + "step": 8508 + }, + { + "epoch": 3.141028796455821, + "grad_norm": 0.943183183670044, + "learning_rate": 2.337179738943992e-06, + "loss": 0.6129, + "step": 8509 + }, + { + "epoch": 3.141397981786857, + "grad_norm": 0.9143814444541931, + "learning_rate": 2.336694399761003e-06, + "loss": 0.6174, + "step": 8510 + }, + { + "epoch": 3.141767167117893, + "grad_norm": 0.9392209649085999, + "learning_rate": 2.336209066759079e-06, + "loss": 0.5945, + "step": 8511 + }, + { + "epoch": 3.1421363524489294, + "grad_norm": 0.8853226900100708, + "learning_rate": 2.33572373995659e-06, + "loss": 0.5622, + "step": 8512 + }, + { + "epoch": 3.1425055377799653, + "grad_norm": 0.9551788568496704, + "learning_rate": 2.3352384193719057e-06, + "loss": 0.6085, + "step": 8513 + }, + { + "epoch": 3.1428747231110017, + "grad_norm": 0.9297163486480713, + "learning_rate": 2.334753105023395e-06, + "loss": 0.6042, + "step": 8514 + }, + { + "epoch": 3.143243908442038, + "grad_norm": 0.9182400703430176, + "learning_rate": 2.334267796929428e-06, + "loss": 0.604, + "step": 8515 + }, + { + "epoch": 3.143613093773074, + "grad_norm": 0.9404250383377075, + "learning_rate": 2.3337824951083716e-06, + "loss": 0.6242, + "step": 8516 + }, + { + "epoch": 3.1439822791041103, + "grad_norm": 0.908069908618927, + "learning_rate": 2.3332971995785945e-06, + "loss": 0.579, + "step": 8517 + }, + { + "epoch": 3.1443514644351462, + "grad_norm": 1.024094581604004, + "learning_rate": 2.3328119103584683e-06, + "loss": 0.5959, + "step": 8518 + }, + { + "epoch": 3.1447206497661826, + "grad_norm": 0.912071168422699, + "learning_rate": 2.3323266274663562e-06, + "loss": 0.5854, + "step": 8519 + }, + { + "epoch": 3.145089835097219, + "grad_norm": 0.9586838483810425, + "learning_rate": 2.3318413509206297e-06, + "loss": 0.6359, + "step": 8520 + }, + { + "epoch": 3.145459020428255, + "grad_norm": 0.9045010805130005, + "learning_rate": 2.331356080739654e-06, + "loss": 0.5782, + "step": 8521 + }, + { + "epoch": 3.1458282057592912, + "grad_norm": 0.9298244118690491, + "learning_rate": 2.3308708169417977e-06, + "loss": 0.5991, + "step": 8522 + }, + { + "epoch": 3.146197391090327, + "grad_norm": 0.9174550175666809, + "learning_rate": 2.3303855595454282e-06, + "loss": 0.6245, + "step": 8523 + }, + { + "epoch": 3.1465665764213635, + "grad_norm": 0.9345257878303528, + "learning_rate": 2.3299003085689116e-06, + "loss": 0.5983, + "step": 8524 + }, + { + "epoch": 3.1469357617524, + "grad_norm": 0.9294345378875732, + "learning_rate": 2.3294150640306144e-06, + "loss": 0.6135, + "step": 8525 + }, + { + "epoch": 3.1473049470834358, + "grad_norm": 0.9463623762130737, + "learning_rate": 2.3289298259489034e-06, + "loss": 0.5955, + "step": 8526 + }, + { + "epoch": 3.147674132414472, + "grad_norm": 0.921941876411438, + "learning_rate": 2.3284445943421435e-06, + "loss": 0.5748, + "step": 8527 + }, + { + "epoch": 3.148043317745508, + "grad_norm": 0.891093373298645, + "learning_rate": 2.3279593692287034e-06, + "loss": 0.5776, + "step": 8528 + }, + { + "epoch": 3.1484125030765444, + "grad_norm": 0.8847788572311401, + "learning_rate": 2.3274741506269448e-06, + "loss": 0.5791, + "step": 8529 + }, + { + "epoch": 3.1487816884075808, + "grad_norm": 0.9006441235542297, + "learning_rate": 2.3269889385552363e-06, + "loss": 0.6068, + "step": 8530 + }, + { + "epoch": 3.1491508737386167, + "grad_norm": 0.9566858410835266, + "learning_rate": 2.326503733031942e-06, + "loss": 0.5916, + "step": 8531 + }, + { + "epoch": 3.149520059069653, + "grad_norm": 0.9041210412979126, + "learning_rate": 2.326018534075426e-06, + "loss": 0.572, + "step": 8532 + }, + { + "epoch": 3.149889244400689, + "grad_norm": 0.9160171747207642, + "learning_rate": 2.3255333417040548e-06, + "loss": 0.5841, + "step": 8533 + }, + { + "epoch": 3.1502584297317253, + "grad_norm": 0.9458027482032776, + "learning_rate": 2.3250481559361904e-06, + "loss": 0.6435, + "step": 8534 + }, + { + "epoch": 3.1506276150627617, + "grad_norm": 0.942514955997467, + "learning_rate": 2.3245629767901976e-06, + "loss": 0.6104, + "step": 8535 + }, + { + "epoch": 3.1509968003937976, + "grad_norm": 0.9664667844772339, + "learning_rate": 2.3240778042844424e-06, + "loss": 0.6256, + "step": 8536 + }, + { + "epoch": 3.151365985724834, + "grad_norm": 0.9075684547424316, + "learning_rate": 2.3235926384372853e-06, + "loss": 0.6127, + "step": 8537 + }, + { + "epoch": 3.15173517105587, + "grad_norm": 0.9048833847045898, + "learning_rate": 2.323107479267093e-06, + "loss": 0.5449, + "step": 8538 + }, + { + "epoch": 3.1521043563869062, + "grad_norm": 0.9355981349945068, + "learning_rate": 2.3226223267922247e-06, + "loss": 0.622, + "step": 8539 + }, + { + "epoch": 3.1524735417179426, + "grad_norm": 0.9520146250724792, + "learning_rate": 2.322137181031046e-06, + "loss": 0.6095, + "step": 8540 + }, + { + "epoch": 3.1528427270489785, + "grad_norm": 0.9299353361129761, + "learning_rate": 2.3216520420019194e-06, + "loss": 0.5779, + "step": 8541 + }, + { + "epoch": 3.153211912380015, + "grad_norm": 0.915331244468689, + "learning_rate": 2.3211669097232065e-06, + "loss": 0.5858, + "step": 8542 + }, + { + "epoch": 3.1535810977110508, + "grad_norm": 0.9144986271858215, + "learning_rate": 2.3206817842132703e-06, + "loss": 0.6029, + "step": 8543 + }, + { + "epoch": 3.153950283042087, + "grad_norm": 0.9237534999847412, + "learning_rate": 2.3201966654904705e-06, + "loss": 0.5858, + "step": 8544 + }, + { + "epoch": 3.1543194683731235, + "grad_norm": 0.9325386881828308, + "learning_rate": 2.3197115535731706e-06, + "loss": 0.5915, + "step": 8545 + }, + { + "epoch": 3.1546886537041594, + "grad_norm": 0.906336784362793, + "learning_rate": 2.3192264484797323e-06, + "loss": 0.5733, + "step": 8546 + }, + { + "epoch": 3.1550578390351958, + "grad_norm": 0.9371514916419983, + "learning_rate": 2.318741350228515e-06, + "loss": 0.6089, + "step": 8547 + }, + { + "epoch": 3.1554270243662317, + "grad_norm": 0.8866013884544373, + "learning_rate": 2.3182562588378814e-06, + "loss": 0.5466, + "step": 8548 + }, + { + "epoch": 3.155796209697268, + "grad_norm": 0.9302651882171631, + "learning_rate": 2.3177711743261897e-06, + "loss": 0.5858, + "step": 8549 + }, + { + "epoch": 3.1561653950283044, + "grad_norm": 0.9000886082649231, + "learning_rate": 2.3172860967118022e-06, + "loss": 0.6018, + "step": 8550 + }, + { + "epoch": 3.1565345803593403, + "grad_norm": 0.9269587397575378, + "learning_rate": 2.3168010260130787e-06, + "loss": 0.5785, + "step": 8551 + }, + { + "epoch": 3.1569037656903767, + "grad_norm": 0.9934300184249878, + "learning_rate": 2.316315962248378e-06, + "loss": 0.6126, + "step": 8552 + }, + { + "epoch": 3.1572729510214126, + "grad_norm": 0.9203448295593262, + "learning_rate": 2.3158309054360607e-06, + "loss": 0.58, + "step": 8553 + }, + { + "epoch": 3.157642136352449, + "grad_norm": 0.9438864588737488, + "learning_rate": 2.315345855594485e-06, + "loss": 0.5846, + "step": 8554 + }, + { + "epoch": 3.1580113216834853, + "grad_norm": 0.9393776655197144, + "learning_rate": 2.3148608127420095e-06, + "loss": 0.5955, + "step": 8555 + }, + { + "epoch": 3.158380507014521, + "grad_norm": 0.9144003391265869, + "learning_rate": 2.314375776896996e-06, + "loss": 0.5755, + "step": 8556 + }, + { + "epoch": 3.1587496923455576, + "grad_norm": 0.9257959127426147, + "learning_rate": 2.313890748077799e-06, + "loss": 0.5718, + "step": 8557 + }, + { + "epoch": 3.1591188776765935, + "grad_norm": 0.9231405258178711, + "learning_rate": 2.31340572630278e-06, + "loss": 0.6089, + "step": 8558 + }, + { + "epoch": 3.15948806300763, + "grad_norm": 0.8927510380744934, + "learning_rate": 2.3129207115902936e-06, + "loss": 0.594, + "step": 8559 + }, + { + "epoch": 3.159857248338666, + "grad_norm": 0.9071226716041565, + "learning_rate": 2.3124357039587e-06, + "loss": 0.5634, + "step": 8560 + }, + { + "epoch": 3.160226433669702, + "grad_norm": 0.8923119902610779, + "learning_rate": 2.3119507034263562e-06, + "loss": 0.5616, + "step": 8561 + }, + { + "epoch": 3.1605956190007385, + "grad_norm": 0.8869242072105408, + "learning_rate": 2.311465710011618e-06, + "loss": 0.5855, + "step": 8562 + }, + { + "epoch": 3.1609648043317744, + "grad_norm": 0.9257856011390686, + "learning_rate": 2.310980723732843e-06, + "loss": 0.5852, + "step": 8563 + }, + { + "epoch": 3.1613339896628108, + "grad_norm": 0.8855143785476685, + "learning_rate": 2.31049574460839e-06, + "loss": 0.5983, + "step": 8564 + }, + { + "epoch": 3.161703174993847, + "grad_norm": 0.8802857398986816, + "learning_rate": 2.310010772656611e-06, + "loss": 0.6016, + "step": 8565 + }, + { + "epoch": 3.162072360324883, + "grad_norm": 0.9187138676643372, + "learning_rate": 2.3095258078958663e-06, + "loss": 0.5905, + "step": 8566 + }, + { + "epoch": 3.1624415456559194, + "grad_norm": 0.937039852142334, + "learning_rate": 2.3090408503445078e-06, + "loss": 0.5736, + "step": 8567 + }, + { + "epoch": 3.1628107309869553, + "grad_norm": 0.8839238286018372, + "learning_rate": 2.3085559000208937e-06, + "loss": 0.5688, + "step": 8568 + }, + { + "epoch": 3.1631799163179917, + "grad_norm": 0.9500434398651123, + "learning_rate": 2.3080709569433784e-06, + "loss": 0.6331, + "step": 8569 + }, + { + "epoch": 3.1635491016490276, + "grad_norm": 0.9075508713722229, + "learning_rate": 2.3075860211303166e-06, + "loss": 0.6035, + "step": 8570 + }, + { + "epoch": 3.163918286980064, + "grad_norm": 0.9577450156211853, + "learning_rate": 2.3071010926000636e-06, + "loss": 0.6, + "step": 8571 + }, + { + "epoch": 3.1642874723111003, + "grad_norm": 0.9370021224021912, + "learning_rate": 2.306616171370973e-06, + "loss": 0.6163, + "step": 8572 + }, + { + "epoch": 3.164656657642136, + "grad_norm": 0.9105225801467896, + "learning_rate": 2.3061312574613984e-06, + "loss": 0.5779, + "step": 8573 + }, + { + "epoch": 3.1650258429731726, + "grad_norm": 0.8943333029747009, + "learning_rate": 2.3056463508896965e-06, + "loss": 0.5873, + "step": 8574 + }, + { + "epoch": 3.165395028304209, + "grad_norm": 0.9321951270103455, + "learning_rate": 2.305161451674217e-06, + "loss": 0.578, + "step": 8575 + }, + { + "epoch": 3.165764213635245, + "grad_norm": 0.9186437726020813, + "learning_rate": 2.304676559833317e-06, + "loss": 0.5692, + "step": 8576 + }, + { + "epoch": 3.166133398966281, + "grad_norm": 0.9026046395301819, + "learning_rate": 2.3041916753853457e-06, + "loss": 0.5873, + "step": 8577 + }, + { + "epoch": 3.166502584297317, + "grad_norm": 0.9294548630714417, + "learning_rate": 2.303706798348658e-06, + "loss": 0.6023, + "step": 8578 + }, + { + "epoch": 3.1668717696283535, + "grad_norm": 0.9676361083984375, + "learning_rate": 2.303221928741607e-06, + "loss": 0.6052, + "step": 8579 + }, + { + "epoch": 3.1672409549593894, + "grad_norm": 0.9175201654434204, + "learning_rate": 2.302737066582543e-06, + "loss": 0.5744, + "step": 8580 + }, + { + "epoch": 3.1676101402904258, + "grad_norm": 0.9522669911384583, + "learning_rate": 2.3022522118898195e-06, + "loss": 0.59, + "step": 8581 + }, + { + "epoch": 3.167979325621462, + "grad_norm": 0.9130473136901855, + "learning_rate": 2.301767364681787e-06, + "loss": 0.5887, + "step": 8582 + }, + { + "epoch": 3.168348510952498, + "grad_norm": 0.8894670605659485, + "learning_rate": 2.3012825249767967e-06, + "loss": 0.5448, + "step": 8583 + }, + { + "epoch": 3.1687176962835344, + "grad_norm": 0.9237234592437744, + "learning_rate": 2.300797692793201e-06, + "loss": 0.6112, + "step": 8584 + }, + { + "epoch": 3.1690868816145707, + "grad_norm": 0.9106210470199585, + "learning_rate": 2.300312868149349e-06, + "loss": 0.5642, + "step": 8585 + }, + { + "epoch": 3.1694560669456067, + "grad_norm": 0.914417564868927, + "learning_rate": 2.2998280510635933e-06, + "loss": 0.5558, + "step": 8586 + }, + { + "epoch": 3.169825252276643, + "grad_norm": 0.9107036590576172, + "learning_rate": 2.299343241554281e-06, + "loss": 0.5778, + "step": 8587 + }, + { + "epoch": 3.170194437607679, + "grad_norm": 0.9164878129959106, + "learning_rate": 2.2988584396397642e-06, + "loss": 0.5616, + "step": 8588 + }, + { + "epoch": 3.1705636229387153, + "grad_norm": 0.934649646282196, + "learning_rate": 2.2983736453383925e-06, + "loss": 0.6004, + "step": 8589 + }, + { + "epoch": 3.170932808269751, + "grad_norm": 0.9202089905738831, + "learning_rate": 2.297888858668514e-06, + "loss": 0.6018, + "step": 8590 + }, + { + "epoch": 3.1713019936007876, + "grad_norm": 0.9014228582382202, + "learning_rate": 2.2974040796484794e-06, + "loss": 0.5937, + "step": 8591 + }, + { + "epoch": 3.171671178931824, + "grad_norm": 0.9038398861885071, + "learning_rate": 2.2969193082966353e-06, + "loss": 0.5424, + "step": 8592 + }, + { + "epoch": 3.17204036426286, + "grad_norm": 0.9409590363502502, + "learning_rate": 2.2964345446313315e-06, + "loss": 0.6322, + "step": 8593 + }, + { + "epoch": 3.172409549593896, + "grad_norm": 0.9318797588348389, + "learning_rate": 2.2959497886709167e-06, + "loss": 0.6102, + "step": 8594 + }, + { + "epoch": 3.172778734924932, + "grad_norm": 0.9150862693786621, + "learning_rate": 2.295465040433737e-06, + "loss": 0.5979, + "step": 8595 + }, + { + "epoch": 3.1731479202559685, + "grad_norm": 0.8967246413230896, + "learning_rate": 2.294980299938142e-06, + "loss": 0.5635, + "step": 8596 + }, + { + "epoch": 3.173517105587005, + "grad_norm": 0.9375881552696228, + "learning_rate": 2.2944955672024772e-06, + "loss": 0.6197, + "step": 8597 + }, + { + "epoch": 3.1738862909180408, + "grad_norm": 0.9383220672607422, + "learning_rate": 2.2940108422450904e-06, + "loss": 0.6165, + "step": 8598 + }, + { + "epoch": 3.174255476249077, + "grad_norm": 0.9041957855224609, + "learning_rate": 2.2935261250843284e-06, + "loss": 0.5673, + "step": 8599 + }, + { + "epoch": 3.174624661580113, + "grad_norm": 0.9180042147636414, + "learning_rate": 2.2930414157385372e-06, + "loss": 0.6163, + "step": 8600 + }, + { + "epoch": 3.1749938469111494, + "grad_norm": 0.9094588160514832, + "learning_rate": 2.292556714226063e-06, + "loss": 0.5768, + "step": 8601 + }, + { + "epoch": 3.1753630322421857, + "grad_norm": 0.924124538898468, + "learning_rate": 2.2920720205652524e-06, + "loss": 0.6046, + "step": 8602 + }, + { + "epoch": 3.1757322175732217, + "grad_norm": 0.9404337406158447, + "learning_rate": 2.2915873347744492e-06, + "loss": 0.5989, + "step": 8603 + }, + { + "epoch": 3.176101402904258, + "grad_norm": 0.8982342481613159, + "learning_rate": 2.291102656872001e-06, + "loss": 0.5741, + "step": 8604 + }, + { + "epoch": 3.176470588235294, + "grad_norm": 0.9373880624771118, + "learning_rate": 2.2906179868762496e-06, + "loss": 0.6127, + "step": 8605 + }, + { + "epoch": 3.1768397735663303, + "grad_norm": 0.9127764701843262, + "learning_rate": 2.290133324805542e-06, + "loss": 0.5874, + "step": 8606 + }, + { + "epoch": 3.1772089588973667, + "grad_norm": 0.9014635682106018, + "learning_rate": 2.2896486706782225e-06, + "loss": 0.5587, + "step": 8607 + }, + { + "epoch": 3.1775781442284026, + "grad_norm": 0.8955909013748169, + "learning_rate": 2.2891640245126336e-06, + "loss": 0.5775, + "step": 8608 + }, + { + "epoch": 3.177947329559439, + "grad_norm": 0.9161983728408813, + "learning_rate": 2.2886793863271205e-06, + "loss": 0.5853, + "step": 8609 + }, + { + "epoch": 3.178316514890475, + "grad_norm": 0.8816854953765869, + "learning_rate": 2.2881947561400256e-06, + "loss": 0.6084, + "step": 8610 + }, + { + "epoch": 3.178685700221511, + "grad_norm": 0.9290160536766052, + "learning_rate": 2.287710133969692e-06, + "loss": 0.586, + "step": 8611 + }, + { + "epoch": 3.1790548855525476, + "grad_norm": 0.9576846957206726, + "learning_rate": 2.2872255198344636e-06, + "loss": 0.5682, + "step": 8612 + }, + { + "epoch": 3.1794240708835835, + "grad_norm": 0.9331993460655212, + "learning_rate": 2.2867409137526812e-06, + "loss": 0.6272, + "step": 8613 + }, + { + "epoch": 3.17979325621462, + "grad_norm": 0.9325345754623413, + "learning_rate": 2.286256315742689e-06, + "loss": 0.6143, + "step": 8614 + }, + { + "epoch": 3.1801624415456557, + "grad_norm": 0.9305222630500793, + "learning_rate": 2.2857717258228265e-06, + "loss": 0.5824, + "step": 8615 + }, + { + "epoch": 3.180531626876692, + "grad_norm": 0.8985062837600708, + "learning_rate": 2.285287144011437e-06, + "loss": 0.6243, + "step": 8616 + }, + { + "epoch": 3.1809008122077285, + "grad_norm": 0.9298938512802124, + "learning_rate": 2.284802570326862e-06, + "loss": 0.5863, + "step": 8617 + }, + { + "epoch": 3.1812699975387644, + "grad_norm": 0.9112500548362732, + "learning_rate": 2.284318004787441e-06, + "loss": 0.5838, + "step": 8618 + }, + { + "epoch": 3.1816391828698007, + "grad_norm": 0.9241980314254761, + "learning_rate": 2.2838334474115165e-06, + "loss": 0.5741, + "step": 8619 + }, + { + "epoch": 3.1820083682008367, + "grad_norm": 0.9027613997459412, + "learning_rate": 2.283348898217427e-06, + "loss": 0.6047, + "step": 8620 + }, + { + "epoch": 3.182377553531873, + "grad_norm": 0.9223675727844238, + "learning_rate": 2.2828643572235125e-06, + "loss": 0.582, + "step": 8621 + }, + { + "epoch": 3.1827467388629094, + "grad_norm": 0.9269042015075684, + "learning_rate": 2.2823798244481156e-06, + "loss": 0.581, + "step": 8622 + }, + { + "epoch": 3.1831159241939453, + "grad_norm": 0.9155677556991577, + "learning_rate": 2.2818952999095716e-06, + "loss": 0.5745, + "step": 8623 + }, + { + "epoch": 3.1834851095249816, + "grad_norm": 0.9264360070228577, + "learning_rate": 2.2814107836262234e-06, + "loss": 0.6047, + "step": 8624 + }, + { + "epoch": 3.1838542948560176, + "grad_norm": 0.9125885963439941, + "learning_rate": 2.280926275616407e-06, + "loss": 0.5863, + "step": 8625 + }, + { + "epoch": 3.184223480187054, + "grad_norm": 0.9225047826766968, + "learning_rate": 2.2804417758984616e-06, + "loss": 0.611, + "step": 8626 + }, + { + "epoch": 3.1845926655180903, + "grad_norm": 0.9119028449058533, + "learning_rate": 2.279957284490727e-06, + "loss": 0.583, + "step": 8627 + }, + { + "epoch": 3.184961850849126, + "grad_norm": 0.8948141932487488, + "learning_rate": 2.2794728014115384e-06, + "loss": 0.6034, + "step": 8628 + }, + { + "epoch": 3.1853310361801626, + "grad_norm": 0.945681631565094, + "learning_rate": 2.278988326679236e-06, + "loss": 0.6011, + "step": 8629 + }, + { + "epoch": 3.1857002215111985, + "grad_norm": 0.8947462439537048, + "learning_rate": 2.278503860312155e-06, + "loss": 0.5741, + "step": 8630 + }, + { + "epoch": 3.186069406842235, + "grad_norm": 0.9098796248435974, + "learning_rate": 2.278019402328632e-06, + "loss": 0.5637, + "step": 8631 + }, + { + "epoch": 3.186438592173271, + "grad_norm": 0.8883733749389648, + "learning_rate": 2.2775349527470064e-06, + "loss": 0.5545, + "step": 8632 + }, + { + "epoch": 3.186807777504307, + "grad_norm": 0.9368519186973572, + "learning_rate": 2.277050511585611e-06, + "loss": 0.6067, + "step": 8633 + }, + { + "epoch": 3.1871769628353435, + "grad_norm": 0.9290332198143005, + "learning_rate": 2.276566078862785e-06, + "loss": 0.6247, + "step": 8634 + }, + { + "epoch": 3.1875461481663794, + "grad_norm": 0.8963862061500549, + "learning_rate": 2.2760816545968608e-06, + "loss": 0.6011, + "step": 8635 + }, + { + "epoch": 3.1879153334974157, + "grad_norm": 0.9051644206047058, + "learning_rate": 2.2755972388061757e-06, + "loss": 0.6167, + "step": 8636 + }, + { + "epoch": 3.1882845188284517, + "grad_norm": 0.9003005027770996, + "learning_rate": 2.275112831509065e-06, + "loss": 0.5628, + "step": 8637 + }, + { + "epoch": 3.188653704159488, + "grad_norm": 0.9585530757904053, + "learning_rate": 2.274628432723862e-06, + "loss": 0.6241, + "step": 8638 + }, + { + "epoch": 3.1890228894905244, + "grad_norm": 0.9203804135322571, + "learning_rate": 2.274144042468902e-06, + "loss": 0.5847, + "step": 8639 + }, + { + "epoch": 3.1893920748215603, + "grad_norm": 0.907270073890686, + "learning_rate": 2.273659660762519e-06, + "loss": 0.5882, + "step": 8640 + }, + { + "epoch": 3.1897612601525966, + "grad_norm": 0.9076530933380127, + "learning_rate": 2.273175287623045e-06, + "loss": 0.6192, + "step": 8641 + }, + { + "epoch": 3.190130445483633, + "grad_norm": 0.9346904158592224, + "learning_rate": 2.2726909230688173e-06, + "loss": 0.6099, + "step": 8642 + }, + { + "epoch": 3.190499630814669, + "grad_norm": 0.9309437870979309, + "learning_rate": 2.272206567118164e-06, + "loss": 0.6006, + "step": 8643 + }, + { + "epoch": 3.1908688161457053, + "grad_norm": 0.9333903789520264, + "learning_rate": 2.271722219789421e-06, + "loss": 0.5543, + "step": 8644 + }, + { + "epoch": 3.191238001476741, + "grad_norm": 0.9237717390060425, + "learning_rate": 2.2712378811009206e-06, + "loss": 0.5826, + "step": 8645 + }, + { + "epoch": 3.1916071868077776, + "grad_norm": 0.924605131149292, + "learning_rate": 2.2707535510709937e-06, + "loss": 0.5554, + "step": 8646 + }, + { + "epoch": 3.1919763721388135, + "grad_norm": 0.9408383965492249, + "learning_rate": 2.270269229717973e-06, + "loss": 0.649, + "step": 8647 + }, + { + "epoch": 3.19234555746985, + "grad_norm": 0.9340365529060364, + "learning_rate": 2.269784917060189e-06, + "loss": 0.5831, + "step": 8648 + }, + { + "epoch": 3.192714742800886, + "grad_norm": 0.9434838891029358, + "learning_rate": 2.269300613115973e-06, + "loss": 0.6046, + "step": 8649 + }, + { + "epoch": 3.193083928131922, + "grad_norm": 0.9395344257354736, + "learning_rate": 2.2688163179036567e-06, + "loss": 0.5757, + "step": 8650 + }, + { + "epoch": 3.1934531134629585, + "grad_norm": 0.911403477191925, + "learning_rate": 2.2683320314415682e-06, + "loss": 0.5802, + "step": 8651 + }, + { + "epoch": 3.193822298793995, + "grad_norm": 0.9384064674377441, + "learning_rate": 2.2678477537480415e-06, + "loss": 0.5771, + "step": 8652 + }, + { + "epoch": 3.1941914841250307, + "grad_norm": 0.9542337656021118, + "learning_rate": 2.267363484841402e-06, + "loss": 0.592, + "step": 8653 + }, + { + "epoch": 3.194560669456067, + "grad_norm": 0.9318562746047974, + "learning_rate": 2.2668792247399822e-06, + "loss": 0.618, + "step": 8654 + }, + { + "epoch": 3.194929854787103, + "grad_norm": 0.9095702171325684, + "learning_rate": 2.26639497346211e-06, + "loss": 0.5725, + "step": 8655 + }, + { + "epoch": 3.1952990401181394, + "grad_norm": 0.917319655418396, + "learning_rate": 2.2659107310261143e-06, + "loss": 0.5801, + "step": 8656 + }, + { + "epoch": 3.1956682254491753, + "grad_norm": 0.9337700605392456, + "learning_rate": 2.265426497450324e-06, + "loss": 0.5658, + "step": 8657 + }, + { + "epoch": 3.1960374107802116, + "grad_norm": 0.9154701232910156, + "learning_rate": 2.2649422727530662e-06, + "loss": 0.6009, + "step": 8658 + }, + { + "epoch": 3.196406596111248, + "grad_norm": 0.9296781420707703, + "learning_rate": 2.2644580569526694e-06, + "loss": 0.5797, + "step": 8659 + }, + { + "epoch": 3.196775781442284, + "grad_norm": 0.9050329327583313, + "learning_rate": 2.263973850067461e-06, + "loss": 0.595, + "step": 8660 + }, + { + "epoch": 3.1971449667733203, + "grad_norm": 0.9048101305961609, + "learning_rate": 2.263489652115767e-06, + "loss": 0.5683, + "step": 8661 + }, + { + "epoch": 3.197514152104356, + "grad_norm": 0.9141092896461487, + "learning_rate": 2.2630054631159167e-06, + "loss": 0.6253, + "step": 8662 + }, + { + "epoch": 3.1978833374353925, + "grad_norm": 0.9157530069351196, + "learning_rate": 2.2625212830862328e-06, + "loss": 0.5698, + "step": 8663 + }, + { + "epoch": 3.198252522766429, + "grad_norm": 0.9130691885948181, + "learning_rate": 2.2620371120450444e-06, + "loss": 0.5532, + "step": 8664 + }, + { + "epoch": 3.198621708097465, + "grad_norm": 0.9308655858039856, + "learning_rate": 2.2615529500106763e-06, + "loss": 0.59, + "step": 8665 + }, + { + "epoch": 3.198990893428501, + "grad_norm": 0.9262445569038391, + "learning_rate": 2.2610687970014535e-06, + "loss": 0.552, + "step": 8666 + }, + { + "epoch": 3.199360078759537, + "grad_norm": 0.9269899725914001, + "learning_rate": 2.260584653035702e-06, + "loss": 0.6098, + "step": 8667 + }, + { + "epoch": 3.1997292640905735, + "grad_norm": 0.9037594199180603, + "learning_rate": 2.260100518131745e-06, + "loss": 0.5866, + "step": 8668 + }, + { + "epoch": 3.20009844942161, + "grad_norm": 0.916001558303833, + "learning_rate": 2.259616392307908e-06, + "loss": 0.5867, + "step": 8669 + }, + { + "epoch": 3.2004676347526457, + "grad_norm": 0.9071817994117737, + "learning_rate": 2.259132275582515e-06, + "loss": 0.5788, + "step": 8670 + }, + { + "epoch": 3.200836820083682, + "grad_norm": 0.9056722521781921, + "learning_rate": 2.258648167973888e-06, + "loss": 0.5946, + "step": 8671 + }, + { + "epoch": 3.201206005414718, + "grad_norm": 0.902793824672699, + "learning_rate": 2.258164069500354e-06, + "loss": 0.592, + "step": 8672 + }, + { + "epoch": 3.2015751907457544, + "grad_norm": 0.9295830726623535, + "learning_rate": 2.2576799801802315e-06, + "loss": 0.5824, + "step": 8673 + }, + { + "epoch": 3.2019443760767907, + "grad_norm": 0.9262397289276123, + "learning_rate": 2.257195900031846e-06, + "loss": 0.6057, + "step": 8674 + }, + { + "epoch": 3.2023135614078266, + "grad_norm": 0.9247784614562988, + "learning_rate": 2.2567118290735194e-06, + "loss": 0.5877, + "step": 8675 + }, + { + "epoch": 3.202682746738863, + "grad_norm": 0.8737269043922424, + "learning_rate": 2.256227767323573e-06, + "loss": 0.5535, + "step": 8676 + }, + { + "epoch": 3.203051932069899, + "grad_norm": 0.9367660880088806, + "learning_rate": 2.2557437148003286e-06, + "loss": 0.6069, + "step": 8677 + }, + { + "epoch": 3.2034211174009353, + "grad_norm": 0.9082227349281311, + "learning_rate": 2.2552596715221083e-06, + "loss": 0.5391, + "step": 8678 + }, + { + "epoch": 3.2037903027319716, + "grad_norm": 0.9268051981925964, + "learning_rate": 2.254775637507231e-06, + "loss": 0.6288, + "step": 8679 + }, + { + "epoch": 3.2041594880630075, + "grad_norm": 0.9314618110656738, + "learning_rate": 2.2542916127740194e-06, + "loss": 0.5975, + "step": 8680 + }, + { + "epoch": 3.204528673394044, + "grad_norm": 0.9104109406471252, + "learning_rate": 2.2538075973407917e-06, + "loss": 0.6207, + "step": 8681 + }, + { + "epoch": 3.20489785872508, + "grad_norm": 0.9044783115386963, + "learning_rate": 2.253323591225869e-06, + "loss": 0.5556, + "step": 8682 + }, + { + "epoch": 3.205267044056116, + "grad_norm": 0.9340513944625854, + "learning_rate": 2.252839594447571e-06, + "loss": 0.5898, + "step": 8683 + }, + { + "epoch": 3.2056362293871525, + "grad_norm": 0.9634924530982971, + "learning_rate": 2.252355607024216e-06, + "loss": 0.5897, + "step": 8684 + }, + { + "epoch": 3.2060054147181885, + "grad_norm": 0.9329057335853577, + "learning_rate": 2.251871628974124e-06, + "loss": 0.585, + "step": 8685 + }, + { + "epoch": 3.206374600049225, + "grad_norm": 0.9271694421768188, + "learning_rate": 2.2513876603156115e-06, + "loss": 0.6182, + "step": 8686 + }, + { + "epoch": 3.2067437853802607, + "grad_norm": 0.8941601514816284, + "learning_rate": 2.250903701066998e-06, + "loss": 0.5568, + "step": 8687 + }, + { + "epoch": 3.207112970711297, + "grad_norm": 0.9370148181915283, + "learning_rate": 2.2504197512466008e-06, + "loss": 0.5918, + "step": 8688 + }, + { + "epoch": 3.2074821560423334, + "grad_norm": 0.9012881517410278, + "learning_rate": 2.249935810872737e-06, + "loss": 0.5726, + "step": 8689 + }, + { + "epoch": 3.2078513413733694, + "grad_norm": 0.9455480575561523, + "learning_rate": 2.2494518799637243e-06, + "loss": 0.6096, + "step": 8690 + }, + { + "epoch": 3.2082205267044057, + "grad_norm": 0.8956812620162964, + "learning_rate": 2.2489679585378778e-06, + "loss": 0.5709, + "step": 8691 + }, + { + "epoch": 3.2085897120354416, + "grad_norm": 0.9041569232940674, + "learning_rate": 2.2484840466135154e-06, + "loss": 0.5623, + "step": 8692 + }, + { + "epoch": 3.208958897366478, + "grad_norm": 0.9307026863098145, + "learning_rate": 2.2480001442089533e-06, + "loss": 0.5235, + "step": 8693 + }, + { + "epoch": 3.2093280826975144, + "grad_norm": 0.9330729246139526, + "learning_rate": 2.2475162513425052e-06, + "loss": 0.6131, + "step": 8694 + }, + { + "epoch": 3.2096972680285503, + "grad_norm": 0.9635587334632874, + "learning_rate": 2.2470323680324883e-06, + "loss": 0.6153, + "step": 8695 + }, + { + "epoch": 3.2100664533595866, + "grad_norm": 0.9334931373596191, + "learning_rate": 2.2465484942972153e-06, + "loss": 0.5975, + "step": 8696 + }, + { + "epoch": 3.2104356386906225, + "grad_norm": 0.9289410710334778, + "learning_rate": 2.2460646301550026e-06, + "loss": 0.5927, + "step": 8697 + }, + { + "epoch": 3.210804824021659, + "grad_norm": 0.8897057771682739, + "learning_rate": 2.2455807756241633e-06, + "loss": 0.5979, + "step": 8698 + }, + { + "epoch": 3.2111740093526953, + "grad_norm": 0.9028593301773071, + "learning_rate": 2.2450969307230112e-06, + "loss": 0.5921, + "step": 8699 + }, + { + "epoch": 3.211543194683731, + "grad_norm": 1.01063072681427, + "learning_rate": 2.2446130954698604e-06, + "loss": 0.6314, + "step": 8700 + }, + { + "epoch": 3.2119123800147675, + "grad_norm": 0.8796990513801575, + "learning_rate": 2.244129269883022e-06, + "loss": 0.5307, + "step": 8701 + }, + { + "epoch": 3.2122815653458034, + "grad_norm": 0.8860458135604858, + "learning_rate": 2.2436454539808104e-06, + "loss": 0.5696, + "step": 8702 + }, + { + "epoch": 3.21265075067684, + "grad_norm": 0.907902181148529, + "learning_rate": 2.243161647781538e-06, + "loss": 0.607, + "step": 8703 + }, + { + "epoch": 3.2130199360078757, + "grad_norm": 0.9309147596359253, + "learning_rate": 2.2426778513035164e-06, + "loss": 0.5713, + "step": 8704 + }, + { + "epoch": 3.213389121338912, + "grad_norm": 0.9226629734039307, + "learning_rate": 2.2421940645650565e-06, + "loss": 0.6387, + "step": 8705 + }, + { + "epoch": 3.2137583066699484, + "grad_norm": 0.9121406674385071, + "learning_rate": 2.24171028758447e-06, + "loss": 0.5712, + "step": 8706 + }, + { + "epoch": 3.2141274920009844, + "grad_norm": 0.9391648769378662, + "learning_rate": 2.2412265203800674e-06, + "loss": 0.6296, + "step": 8707 + }, + { + "epoch": 3.2144966773320207, + "grad_norm": 0.9183779954910278, + "learning_rate": 2.2407427629701597e-06, + "loss": 0.613, + "step": 8708 + }, + { + "epoch": 3.214865862663057, + "grad_norm": 0.9314582943916321, + "learning_rate": 2.2402590153730562e-06, + "loss": 0.5894, + "step": 8709 + }, + { + "epoch": 3.215235047994093, + "grad_norm": 0.912774384021759, + "learning_rate": 2.239775277607068e-06, + "loss": 0.5774, + "step": 8710 + }, + { + "epoch": 3.2156042333251293, + "grad_norm": 0.9159666895866394, + "learning_rate": 2.2392915496905017e-06, + "loss": 0.584, + "step": 8711 + }, + { + "epoch": 3.2159734186561653, + "grad_norm": 0.9288997650146484, + "learning_rate": 2.2388078316416683e-06, + "loss": 0.567, + "step": 8712 + }, + { + "epoch": 3.2163426039872016, + "grad_norm": 0.9428343772888184, + "learning_rate": 2.238324123478877e-06, + "loss": 0.6413, + "step": 8713 + }, + { + "epoch": 3.2167117893182375, + "grad_norm": 0.9343757033348083, + "learning_rate": 2.2378404252204344e-06, + "loss": 0.5829, + "step": 8714 + }, + { + "epoch": 3.217080974649274, + "grad_norm": 0.9383882284164429, + "learning_rate": 2.237356736884649e-06, + "loss": 0.597, + "step": 8715 + }, + { + "epoch": 3.2174501599803103, + "grad_norm": 0.9528592824935913, + "learning_rate": 2.236873058489829e-06, + "loss": 0.6079, + "step": 8716 + }, + { + "epoch": 3.217819345311346, + "grad_norm": 0.927300751209259, + "learning_rate": 2.2363893900542797e-06, + "loss": 0.5954, + "step": 8717 + }, + { + "epoch": 3.2181885306423825, + "grad_norm": 0.9205749034881592, + "learning_rate": 2.2359057315963097e-06, + "loss": 0.5891, + "step": 8718 + }, + { + "epoch": 3.218557715973419, + "grad_norm": 0.9409298300743103, + "learning_rate": 2.2354220831342238e-06, + "loss": 0.6257, + "step": 8719 + }, + { + "epoch": 3.218926901304455, + "grad_norm": 0.9324811697006226, + "learning_rate": 2.234938444686328e-06, + "loss": 0.5955, + "step": 8720 + }, + { + "epoch": 3.219296086635491, + "grad_norm": 0.9127917289733887, + "learning_rate": 2.2344548162709294e-06, + "loss": 0.6066, + "step": 8721 + }, + { + "epoch": 3.219665271966527, + "grad_norm": 1.0422576665878296, + "learning_rate": 2.2339711979063318e-06, + "loss": 0.5753, + "step": 8722 + }, + { + "epoch": 3.2200344572975634, + "grad_norm": 0.8956278562545776, + "learning_rate": 2.2334875896108404e-06, + "loss": 0.6091, + "step": 8723 + }, + { + "epoch": 3.2204036426285993, + "grad_norm": 0.8983429074287415, + "learning_rate": 2.2330039914027597e-06, + "loss": 0.5949, + "step": 8724 + }, + { + "epoch": 3.2207728279596357, + "grad_norm": 0.9057712554931641, + "learning_rate": 2.2325204033003933e-06, + "loss": 0.5862, + "step": 8725 + }, + { + "epoch": 3.221142013290672, + "grad_norm": 0.9280909299850464, + "learning_rate": 2.232036825322046e-06, + "loss": 0.5928, + "step": 8726 + }, + { + "epoch": 3.221511198621708, + "grad_norm": 0.9242276549339294, + "learning_rate": 2.2315532574860196e-06, + "loss": 0.6432, + "step": 8727 + }, + { + "epoch": 3.2218803839527443, + "grad_norm": 0.9083799719810486, + "learning_rate": 2.2310696998106186e-06, + "loss": 0.5653, + "step": 8728 + }, + { + "epoch": 3.2222495692837803, + "grad_norm": 0.9710299372673035, + "learning_rate": 2.230586152314144e-06, + "loss": 0.6148, + "step": 8729 + }, + { + "epoch": 3.2226187546148166, + "grad_norm": 0.9467484354972839, + "learning_rate": 2.2301026150148975e-06, + "loss": 0.5932, + "step": 8730 + }, + { + "epoch": 3.222987939945853, + "grad_norm": 0.9087077975273132, + "learning_rate": 2.229619087931183e-06, + "loss": 0.5776, + "step": 8731 + }, + { + "epoch": 3.223357125276889, + "grad_norm": 0.9213501811027527, + "learning_rate": 2.2291355710813003e-06, + "loss": 0.5961, + "step": 8732 + }, + { + "epoch": 3.2237263106079252, + "grad_norm": 0.9297762513160706, + "learning_rate": 2.2286520644835513e-06, + "loss": 0.6158, + "step": 8733 + }, + { + "epoch": 3.224095495938961, + "grad_norm": 0.9281464219093323, + "learning_rate": 2.2281685681562352e-06, + "loss": 0.5961, + "step": 8734 + }, + { + "epoch": 3.2244646812699975, + "grad_norm": 0.9243818521499634, + "learning_rate": 2.227685082117653e-06, + "loss": 0.6002, + "step": 8735 + }, + { + "epoch": 3.224833866601034, + "grad_norm": 0.9097726941108704, + "learning_rate": 2.2272016063861057e-06, + "loss": 0.5965, + "step": 8736 + }, + { + "epoch": 3.22520305193207, + "grad_norm": 0.9434778094291687, + "learning_rate": 2.2267181409798904e-06, + "loss": 0.5953, + "step": 8737 + }, + { + "epoch": 3.225572237263106, + "grad_norm": 0.9154404401779175, + "learning_rate": 2.226234685917308e-06, + "loss": 0.5884, + "step": 8738 + }, + { + "epoch": 3.225941422594142, + "grad_norm": 0.9232131242752075, + "learning_rate": 2.2257512412166558e-06, + "loss": 0.5671, + "step": 8739 + }, + { + "epoch": 3.2263106079251784, + "grad_norm": 0.9318534731864929, + "learning_rate": 2.225267806896232e-06, + "loss": 0.5818, + "step": 8740 + }, + { + "epoch": 3.226679793256215, + "grad_norm": 0.9412238001823425, + "learning_rate": 2.224784382974336e-06, + "loss": 0.5955, + "step": 8741 + }, + { + "epoch": 3.2270489785872507, + "grad_norm": 1.202566146850586, + "learning_rate": 2.2243009694692635e-06, + "loss": 0.5811, + "step": 8742 + }, + { + "epoch": 3.227418163918287, + "grad_norm": 0.902772068977356, + "learning_rate": 2.223817566399313e-06, + "loss": 0.5853, + "step": 8743 + }, + { + "epoch": 3.227787349249323, + "grad_norm": 0.9262640476226807, + "learning_rate": 2.22333417378278e-06, + "loss": 0.6071, + "step": 8744 + }, + { + "epoch": 3.2281565345803593, + "grad_norm": 0.9406147599220276, + "learning_rate": 2.2228507916379613e-06, + "loss": 0.5668, + "step": 8745 + }, + { + "epoch": 3.2285257199113957, + "grad_norm": 0.890205979347229, + "learning_rate": 2.222367419983153e-06, + "loss": 0.5753, + "step": 8746 + }, + { + "epoch": 3.2288949052424316, + "grad_norm": 0.9725748896598816, + "learning_rate": 2.22188405883665e-06, + "loss": 0.6111, + "step": 8747 + }, + { + "epoch": 3.229264090573468, + "grad_norm": 0.9062351584434509, + "learning_rate": 2.2214007082167477e-06, + "loss": 0.6019, + "step": 8748 + }, + { + "epoch": 3.229633275904504, + "grad_norm": 0.9094552993774414, + "learning_rate": 2.2209173681417405e-06, + "loss": 0.5788, + "step": 8749 + }, + { + "epoch": 3.2300024612355402, + "grad_norm": 0.9257186055183411, + "learning_rate": 2.220434038629922e-06, + "loss": 0.5844, + "step": 8750 + }, + { + "epoch": 3.2303716465665766, + "grad_norm": 0.9409226775169373, + "learning_rate": 2.219950719699588e-06, + "loss": 0.5969, + "step": 8751 + }, + { + "epoch": 3.2307408318976125, + "grad_norm": 0.9672120809555054, + "learning_rate": 2.2194674113690304e-06, + "loss": 0.6344, + "step": 8752 + }, + { + "epoch": 3.231110017228649, + "grad_norm": 0.9085201025009155, + "learning_rate": 2.2189841136565426e-06, + "loss": 0.5676, + "step": 8753 + }, + { + "epoch": 3.231479202559685, + "grad_norm": 0.9135740399360657, + "learning_rate": 2.2185008265804186e-06, + "loss": 0.5861, + "step": 8754 + }, + { + "epoch": 3.231848387890721, + "grad_norm": 0.9307097792625427, + "learning_rate": 2.2180175501589484e-06, + "loss": 0.5879, + "step": 8755 + }, + { + "epoch": 3.2322175732217575, + "grad_norm": 0.8995985984802246, + "learning_rate": 2.2175342844104254e-06, + "loss": 0.5798, + "step": 8756 + }, + { + "epoch": 3.2325867585527934, + "grad_norm": 0.8827587962150574, + "learning_rate": 2.21705102935314e-06, + "loss": 0.592, + "step": 8757 + }, + { + "epoch": 3.23295594388383, + "grad_norm": 0.9367154836654663, + "learning_rate": 2.2165677850053833e-06, + "loss": 0.5927, + "step": 8758 + }, + { + "epoch": 3.2333251292148657, + "grad_norm": 0.9345464706420898, + "learning_rate": 2.2160845513854485e-06, + "loss": 0.6157, + "step": 8759 + }, + { + "epoch": 3.233694314545902, + "grad_norm": 0.9029926657676697, + "learning_rate": 2.215601328511622e-06, + "loss": 0.6058, + "step": 8760 + }, + { + "epoch": 3.2340634998769384, + "grad_norm": 0.9389760494232178, + "learning_rate": 2.2151181164021965e-06, + "loss": 0.6114, + "step": 8761 + }, + { + "epoch": 3.2344326852079743, + "grad_norm": 0.8730699419975281, + "learning_rate": 2.2146349150754597e-06, + "loss": 0.5895, + "step": 8762 + }, + { + "epoch": 3.2348018705390107, + "grad_norm": 0.9470748901367188, + "learning_rate": 2.2141517245497016e-06, + "loss": 0.6234, + "step": 8763 + }, + { + "epoch": 3.2351710558700466, + "grad_norm": 0.927790641784668, + "learning_rate": 2.213668544843211e-06, + "loss": 0.6159, + "step": 8764 + }, + { + "epoch": 3.235540241201083, + "grad_norm": 0.89668869972229, + "learning_rate": 2.213185375974275e-06, + "loss": 0.5876, + "step": 8765 + }, + { + "epoch": 3.2359094265321193, + "grad_norm": 0.9092085957527161, + "learning_rate": 2.212702217961183e-06, + "loss": 0.576, + "step": 8766 + }, + { + "epoch": 3.2362786118631552, + "grad_norm": 0.926794707775116, + "learning_rate": 2.21221907082222e-06, + "loss": 0.5796, + "step": 8767 + }, + { + "epoch": 3.2366477971941916, + "grad_norm": 0.9065845012664795, + "learning_rate": 2.211735934575674e-06, + "loss": 0.5681, + "step": 8768 + }, + { + "epoch": 3.2370169825252275, + "grad_norm": 0.8947533369064331, + "learning_rate": 2.2112528092398337e-06, + "loss": 0.5663, + "step": 8769 + }, + { + "epoch": 3.237386167856264, + "grad_norm": 0.9244136214256287, + "learning_rate": 2.210769694832982e-06, + "loss": 0.605, + "step": 8770 + }, + { + "epoch": 3.2377553531873, + "grad_norm": 0.9079532623291016, + "learning_rate": 2.210286591373407e-06, + "loss": 0.5675, + "step": 8771 + }, + { + "epoch": 3.238124538518336, + "grad_norm": 0.9060580730438232, + "learning_rate": 2.2098034988793923e-06, + "loss": 0.6216, + "step": 8772 + }, + { + "epoch": 3.2384937238493725, + "grad_norm": 0.9562442898750305, + "learning_rate": 2.209320417369224e-06, + "loss": 0.6134, + "step": 8773 + }, + { + "epoch": 3.2388629091804084, + "grad_norm": 0.9204705357551575, + "learning_rate": 2.208837346861186e-06, + "loss": 0.5982, + "step": 8774 + }, + { + "epoch": 3.239232094511445, + "grad_norm": 0.9167160391807556, + "learning_rate": 2.208354287373563e-06, + "loss": 0.5866, + "step": 8775 + }, + { + "epoch": 3.239601279842481, + "grad_norm": 0.9185822010040283, + "learning_rate": 2.207871238924638e-06, + "loss": 0.5908, + "step": 8776 + }, + { + "epoch": 3.239970465173517, + "grad_norm": 0.9276142120361328, + "learning_rate": 2.207388201532694e-06, + "loss": 0.6136, + "step": 8777 + }, + { + "epoch": 3.2403396505045534, + "grad_norm": 0.9348015785217285, + "learning_rate": 2.2069051752160135e-06, + "loss": 0.5921, + "step": 8778 + }, + { + "epoch": 3.2407088358355893, + "grad_norm": 0.9385659098625183, + "learning_rate": 2.2064221599928816e-06, + "loss": 0.6046, + "step": 8779 + }, + { + "epoch": 3.2410780211666257, + "grad_norm": 0.95173579454422, + "learning_rate": 2.205939155881577e-06, + "loss": 0.5901, + "step": 8780 + }, + { + "epoch": 3.2414472064976616, + "grad_norm": 0.9165025949478149, + "learning_rate": 2.205456162900383e-06, + "loss": 0.5683, + "step": 8781 + }, + { + "epoch": 3.241816391828698, + "grad_norm": 0.9020248055458069, + "learning_rate": 2.2049731810675798e-06, + "loss": 0.569, + "step": 8782 + }, + { + "epoch": 3.2421855771597343, + "grad_norm": 0.9253556132316589, + "learning_rate": 2.204490210401449e-06, + "loss": 0.5888, + "step": 8783 + }, + { + "epoch": 3.2425547624907702, + "grad_norm": 0.920159637928009, + "learning_rate": 2.20400725092027e-06, + "loss": 0.5862, + "step": 8784 + }, + { + "epoch": 3.2429239478218066, + "grad_norm": 0.9047715067863464, + "learning_rate": 2.2035243026423237e-06, + "loss": 0.5981, + "step": 8785 + }, + { + "epoch": 3.243293133152843, + "grad_norm": 0.8955184817314148, + "learning_rate": 2.203041365585888e-06, + "loss": 0.5849, + "step": 8786 + }, + { + "epoch": 3.243662318483879, + "grad_norm": 0.9117116928100586, + "learning_rate": 2.2025584397692444e-06, + "loss": 0.6011, + "step": 8787 + }, + { + "epoch": 3.2440315038149152, + "grad_norm": 0.9004309177398682, + "learning_rate": 2.2020755252106686e-06, + "loss": 0.5696, + "step": 8788 + }, + { + "epoch": 3.244400689145951, + "grad_norm": 0.9023653268814087, + "learning_rate": 2.2015926219284417e-06, + "loss": 0.5833, + "step": 8789 + }, + { + "epoch": 3.2447698744769875, + "grad_norm": 0.9581223130226135, + "learning_rate": 2.201109729940838e-06, + "loss": 0.6199, + "step": 8790 + }, + { + "epoch": 3.2451390598080234, + "grad_norm": 0.9420719146728516, + "learning_rate": 2.200626849266138e-06, + "loss": 0.6045, + "step": 8791 + }, + { + "epoch": 3.2455082451390598, + "grad_norm": 0.9235360622406006, + "learning_rate": 2.200143979922617e-06, + "loss": 0.598, + "step": 8792 + }, + { + "epoch": 3.245877430470096, + "grad_norm": 0.916246771812439, + "learning_rate": 2.1996611219285516e-06, + "loss": 0.6148, + "step": 8793 + }, + { + "epoch": 3.246246615801132, + "grad_norm": 0.9194026589393616, + "learning_rate": 2.1991782753022185e-06, + "loss": 0.5906, + "step": 8794 + }, + { + "epoch": 3.2466158011321684, + "grad_norm": 0.9249829053878784, + "learning_rate": 2.1986954400618916e-06, + "loss": 0.6267, + "step": 8795 + }, + { + "epoch": 3.2469849864632043, + "grad_norm": 0.8985263705253601, + "learning_rate": 2.1982126162258474e-06, + "loss": 0.5871, + "step": 8796 + }, + { + "epoch": 3.2473541717942407, + "grad_norm": 0.9639427065849304, + "learning_rate": 2.1977298038123617e-06, + "loss": 0.6235, + "step": 8797 + }, + { + "epoch": 3.247723357125277, + "grad_norm": 0.9200267791748047, + "learning_rate": 2.1972470028397056e-06, + "loss": 0.5848, + "step": 8798 + }, + { + "epoch": 3.248092542456313, + "grad_norm": 0.9118986129760742, + "learning_rate": 2.196764213326157e-06, + "loss": 0.6017, + "step": 8799 + }, + { + "epoch": 3.2484617277873493, + "grad_norm": 0.9376638531684875, + "learning_rate": 2.1962814352899857e-06, + "loss": 0.5938, + "step": 8800 + }, + { + "epoch": 3.2488309131183852, + "grad_norm": 0.8984713554382324, + "learning_rate": 2.1957986687494667e-06, + "loss": 0.5997, + "step": 8801 + }, + { + "epoch": 3.2492000984494216, + "grad_norm": 0.9243409037590027, + "learning_rate": 2.1953159137228723e-06, + "loss": 0.6219, + "step": 8802 + }, + { + "epoch": 3.249569283780458, + "grad_norm": 0.9208619594573975, + "learning_rate": 2.194833170228474e-06, + "loss": 0.5778, + "step": 8803 + }, + { + "epoch": 3.249938469111494, + "grad_norm": 0.9549753069877625, + "learning_rate": 2.1943504382845447e-06, + "loss": 0.6299, + "step": 8804 + }, + { + "epoch": 3.2503076544425302, + "grad_norm": 0.9488089084625244, + "learning_rate": 2.1938677179093538e-06, + "loss": 0.6051, + "step": 8805 + }, + { + "epoch": 3.250676839773566, + "grad_norm": 0.917931079864502, + "learning_rate": 2.193385009121173e-06, + "loss": 0.5755, + "step": 8806 + }, + { + "epoch": 3.2510460251046025, + "grad_norm": 0.9196712374687195, + "learning_rate": 2.1929023119382743e-06, + "loss": 0.5755, + "step": 8807 + }, + { + "epoch": 3.251415210435639, + "grad_norm": 0.8901126384735107, + "learning_rate": 2.1924196263789246e-06, + "loss": 0.5824, + "step": 8808 + }, + { + "epoch": 3.2517843957666748, + "grad_norm": 0.9289348721504211, + "learning_rate": 2.191936952461396e-06, + "loss": 0.5887, + "step": 8809 + }, + { + "epoch": 3.252153581097711, + "grad_norm": 0.9166722893714905, + "learning_rate": 2.191454290203956e-06, + "loss": 0.5877, + "step": 8810 + }, + { + "epoch": 3.252522766428747, + "grad_norm": 0.911541759967804, + "learning_rate": 2.1909716396248736e-06, + "loss": 0.5614, + "step": 8811 + }, + { + "epoch": 3.2528919517597834, + "grad_norm": 0.9230117797851562, + "learning_rate": 2.190489000742418e-06, + "loss": 0.6225, + "step": 8812 + }, + { + "epoch": 3.2532611370908198, + "grad_norm": 0.9948002696037292, + "learning_rate": 2.1900063735748554e-06, + "loss": 0.5976, + "step": 8813 + }, + { + "epoch": 3.2536303224218557, + "grad_norm": 0.9089682102203369, + "learning_rate": 2.189523758140454e-06, + "loss": 0.5956, + "step": 8814 + }, + { + "epoch": 3.253999507752892, + "grad_norm": 0.9516114592552185, + "learning_rate": 2.18904115445748e-06, + "loss": 0.5625, + "step": 8815 + }, + { + "epoch": 3.254368693083928, + "grad_norm": 0.9249249696731567, + "learning_rate": 2.1885585625441995e-06, + "loss": 0.6007, + "step": 8816 + }, + { + "epoch": 3.2547378784149643, + "grad_norm": 0.913756787776947, + "learning_rate": 2.188075982418881e-06, + "loss": 0.5704, + "step": 8817 + }, + { + "epoch": 3.2551070637460007, + "grad_norm": 0.9520537853240967, + "learning_rate": 2.1875934140997865e-06, + "loss": 0.5852, + "step": 8818 + }, + { + "epoch": 3.2554762490770366, + "grad_norm": 0.9384584426879883, + "learning_rate": 2.1871108576051835e-06, + "loss": 0.6141, + "step": 8819 + }, + { + "epoch": 3.255845434408073, + "grad_norm": 0.9259538650512695, + "learning_rate": 2.1866283129533357e-06, + "loss": 0.6027, + "step": 8820 + }, + { + "epoch": 3.256214619739109, + "grad_norm": 0.9421133399009705, + "learning_rate": 2.186145780162507e-06, + "loss": 0.6216, + "step": 8821 + }, + { + "epoch": 3.256583805070145, + "grad_norm": 0.9278064966201782, + "learning_rate": 2.185663259250962e-06, + "loss": 0.6242, + "step": 8822 + }, + { + "epoch": 3.2569529904011816, + "grad_norm": 0.8959514498710632, + "learning_rate": 2.1851807502369627e-06, + "loss": 0.5927, + "step": 8823 + }, + { + "epoch": 3.2573221757322175, + "grad_norm": 0.9231398701667786, + "learning_rate": 2.1846982531387723e-06, + "loss": 0.5857, + "step": 8824 + }, + { + "epoch": 3.257691361063254, + "grad_norm": 0.925628662109375, + "learning_rate": 2.184215767974655e-06, + "loss": 0.6102, + "step": 8825 + }, + { + "epoch": 3.2580605463942898, + "grad_norm": 0.9022905230522156, + "learning_rate": 2.18373329476287e-06, + "loss": 0.6142, + "step": 8826 + }, + { + "epoch": 3.258429731725326, + "grad_norm": 0.861957848072052, + "learning_rate": 2.1832508335216814e-06, + "loss": 0.5496, + "step": 8827 + }, + { + "epoch": 3.2587989170563625, + "grad_norm": 0.9203256368637085, + "learning_rate": 2.182768384269347e-06, + "loss": 0.5743, + "step": 8828 + }, + { + "epoch": 3.2591681023873984, + "grad_norm": 0.8977341055870056, + "learning_rate": 2.1822859470241296e-06, + "loss": 0.607, + "step": 8829 + }, + { + "epoch": 3.2595372877184348, + "grad_norm": 0.9288895130157471, + "learning_rate": 2.18180352180429e-06, + "loss": 0.6268, + "step": 8830 + }, + { + "epoch": 3.2599064730494707, + "grad_norm": 0.9352108240127563, + "learning_rate": 2.1813211086280856e-06, + "loss": 0.6245, + "step": 8831 + }, + { + "epoch": 3.260275658380507, + "grad_norm": 0.9241617321968079, + "learning_rate": 2.180838707513777e-06, + "loss": 0.5518, + "step": 8832 + }, + { + "epoch": 3.2606448437115434, + "grad_norm": 0.9491792917251587, + "learning_rate": 2.1803563184796227e-06, + "loss": 0.6133, + "step": 8833 + }, + { + "epoch": 3.2610140290425793, + "grad_norm": 0.9351180195808411, + "learning_rate": 2.1798739415438797e-06, + "loss": 0.5999, + "step": 8834 + }, + { + "epoch": 3.2613832143736157, + "grad_norm": 0.9009407758712769, + "learning_rate": 2.179391576724809e-06, + "loss": 0.586, + "step": 8835 + }, + { + "epoch": 3.2617523997046516, + "grad_norm": 0.925926148891449, + "learning_rate": 2.1789092240406644e-06, + "loss": 0.599, + "step": 8836 + }, + { + "epoch": 3.262121585035688, + "grad_norm": 0.9472907185554504, + "learning_rate": 2.1784268835097054e-06, + "loss": 0.6333, + "step": 8837 + }, + { + "epoch": 3.262490770366724, + "grad_norm": 0.9233747124671936, + "learning_rate": 2.1779445551501864e-06, + "loss": 0.5714, + "step": 8838 + }, + { + "epoch": 3.26285995569776, + "grad_norm": 0.909777820110321, + "learning_rate": 2.1774622389803643e-06, + "loss": 0.5976, + "step": 8839 + }, + { + "epoch": 3.2632291410287966, + "grad_norm": 0.9365139603614807, + "learning_rate": 2.1769799350184956e-06, + "loss": 0.5777, + "step": 8840 + }, + { + "epoch": 3.2635983263598325, + "grad_norm": 0.8909774422645569, + "learning_rate": 2.176497643282834e-06, + "loss": 0.5666, + "step": 8841 + }, + { + "epoch": 3.263967511690869, + "grad_norm": 0.9206159114837646, + "learning_rate": 2.1760153637916344e-06, + "loss": 0.6152, + "step": 8842 + }, + { + "epoch": 3.264336697021905, + "grad_norm": 0.9060840606689453, + "learning_rate": 2.175533096563151e-06, + "loss": 0.5727, + "step": 8843 + }, + { + "epoch": 3.264705882352941, + "grad_norm": 0.9458661079406738, + "learning_rate": 2.1750508416156367e-06, + "loss": 0.6156, + "step": 8844 + }, + { + "epoch": 3.2650750676839775, + "grad_norm": 0.932361364364624, + "learning_rate": 2.1745685989673473e-06, + "loss": 0.6139, + "step": 8845 + }, + { + "epoch": 3.2654442530150134, + "grad_norm": 0.9520567655563354, + "learning_rate": 2.1740863686365323e-06, + "loss": 0.6206, + "step": 8846 + }, + { + "epoch": 3.2658134383460498, + "grad_norm": 0.9202482104301453, + "learning_rate": 2.1736041506414465e-06, + "loss": 0.607, + "step": 8847 + }, + { + "epoch": 3.2661826236770857, + "grad_norm": 0.9402969479560852, + "learning_rate": 2.1731219450003395e-06, + "loss": 0.6354, + "step": 8848 + }, + { + "epoch": 3.266551809008122, + "grad_norm": 0.9310118556022644, + "learning_rate": 2.172639751731464e-06, + "loss": 0.6086, + "step": 8849 + }, + { + "epoch": 3.2669209943391584, + "grad_norm": 0.9147279858589172, + "learning_rate": 2.1721575708530708e-06, + "loss": 0.6156, + "step": 8850 + }, + { + "epoch": 3.2672901796701943, + "grad_norm": 0.9260480403900146, + "learning_rate": 2.17167540238341e-06, + "loss": 0.6291, + "step": 8851 + }, + { + "epoch": 3.2676593650012307, + "grad_norm": 0.9368613958358765, + "learning_rate": 2.171193246340732e-06, + "loss": 0.6239, + "step": 8852 + }, + { + "epoch": 3.268028550332267, + "grad_norm": 0.938654899597168, + "learning_rate": 2.1707111027432855e-06, + "loss": 0.5978, + "step": 8853 + }, + { + "epoch": 3.268397735663303, + "grad_norm": 0.9321526885032654, + "learning_rate": 2.170228971609319e-06, + "loss": 0.625, + "step": 8854 + }, + { + "epoch": 3.2687669209943393, + "grad_norm": 0.922097384929657, + "learning_rate": 2.1697468529570836e-06, + "loss": 0.5961, + "step": 8855 + }, + { + "epoch": 3.269136106325375, + "grad_norm": 0.9242653846740723, + "learning_rate": 2.1692647468048235e-06, + "loss": 0.5812, + "step": 8856 + }, + { + "epoch": 3.2695052916564116, + "grad_norm": 0.907293438911438, + "learning_rate": 2.1687826531707906e-06, + "loss": 0.598, + "step": 8857 + }, + { + "epoch": 3.2698744769874475, + "grad_norm": 0.9145967364311218, + "learning_rate": 2.1683005720732277e-06, + "loss": 0.5971, + "step": 8858 + }, + { + "epoch": 3.270243662318484, + "grad_norm": 0.916080117225647, + "learning_rate": 2.1678185035303846e-06, + "loss": 0.5865, + "step": 8859 + }, + { + "epoch": 3.27061284764952, + "grad_norm": 0.8976455926895142, + "learning_rate": 2.1673364475605064e-06, + "loss": 0.5775, + "step": 8860 + }, + { + "epoch": 3.270982032980556, + "grad_norm": 0.909909725189209, + "learning_rate": 2.166854404181838e-06, + "loss": 0.6099, + "step": 8861 + }, + { + "epoch": 3.2713512183115925, + "grad_norm": 0.9230946898460388, + "learning_rate": 2.166372373412625e-06, + "loss": 0.5824, + "step": 8862 + }, + { + "epoch": 3.271720403642629, + "grad_norm": 0.910338282585144, + "learning_rate": 2.165890355271114e-06, + "loss": 0.6086, + "step": 8863 + }, + { + "epoch": 3.2720895889736648, + "grad_norm": 0.9400829076766968, + "learning_rate": 2.1654083497755456e-06, + "loss": 0.5907, + "step": 8864 + }, + { + "epoch": 3.272458774304701, + "grad_norm": 0.932861864566803, + "learning_rate": 2.1649263569441674e-06, + "loss": 0.5947, + "step": 8865 + }, + { + "epoch": 3.272827959635737, + "grad_norm": 0.9313173294067383, + "learning_rate": 2.1644443767952195e-06, + "loss": 0.596, + "step": 8866 + }, + { + "epoch": 3.2731971449667734, + "grad_norm": 0.9649665355682373, + "learning_rate": 2.1639624093469462e-06, + "loss": 0.6456, + "step": 8867 + }, + { + "epoch": 3.2735663302978093, + "grad_norm": 0.9134054780006409, + "learning_rate": 2.16348045461759e-06, + "loss": 0.5871, + "step": 8868 + }, + { + "epoch": 3.2739355156288457, + "grad_norm": 0.9179554581642151, + "learning_rate": 2.1629985126253928e-06, + "loss": 0.6182, + "step": 8869 + }, + { + "epoch": 3.274304700959882, + "grad_norm": 0.9129515886306763, + "learning_rate": 2.162516583388595e-06, + "loss": 0.6128, + "step": 8870 + }, + { + "epoch": 3.274673886290918, + "grad_norm": 0.9040526747703552, + "learning_rate": 2.162034666925438e-06, + "loss": 0.5532, + "step": 8871 + }, + { + "epoch": 3.2750430716219543, + "grad_norm": 0.9011151194572449, + "learning_rate": 2.1615527632541616e-06, + "loss": 0.5644, + "step": 8872 + }, + { + "epoch": 3.27541225695299, + "grad_norm": 0.926634669303894, + "learning_rate": 2.1610708723930078e-06, + "loss": 0.6026, + "step": 8873 + }, + { + "epoch": 3.2757814422840266, + "grad_norm": 0.8955903053283691, + "learning_rate": 2.1605889943602133e-06, + "loss": 0.5705, + "step": 8874 + }, + { + "epoch": 3.276150627615063, + "grad_norm": 0.8854711055755615, + "learning_rate": 2.1601071291740193e-06, + "loss": 0.5601, + "step": 8875 + }, + { + "epoch": 3.276519812946099, + "grad_norm": 0.9160422086715698, + "learning_rate": 2.159625276852662e-06, + "loss": 0.6335, + "step": 8876 + }, + { + "epoch": 3.276888998277135, + "grad_norm": 0.8947936296463013, + "learning_rate": 2.159143437414381e-06, + "loss": 0.5574, + "step": 8877 + }, + { + "epoch": 3.277258183608171, + "grad_norm": 0.9153184294700623, + "learning_rate": 2.158661610877414e-06, + "loss": 0.5908, + "step": 8878 + }, + { + "epoch": 3.2776273689392075, + "grad_norm": 0.9065198302268982, + "learning_rate": 2.158179797259996e-06, + "loss": 0.6057, + "step": 8879 + }, + { + "epoch": 3.277996554270244, + "grad_norm": 0.9266895651817322, + "learning_rate": 2.1576979965803663e-06, + "loss": 0.6126, + "step": 8880 + }, + { + "epoch": 3.2783657396012797, + "grad_norm": 0.9075735211372375, + "learning_rate": 2.1572162088567584e-06, + "loss": 0.5699, + "step": 8881 + }, + { + "epoch": 3.278734924932316, + "grad_norm": 0.9054121375083923, + "learning_rate": 2.1567344341074082e-06, + "loss": 0.5651, + "step": 8882 + }, + { + "epoch": 3.279104110263352, + "grad_norm": 0.929426372051239, + "learning_rate": 2.156252672350553e-06, + "loss": 0.573, + "step": 8883 + }, + { + "epoch": 3.2794732955943884, + "grad_norm": 0.9284286499023438, + "learning_rate": 2.1557709236044243e-06, + "loss": 0.5901, + "step": 8884 + }, + { + "epoch": 3.2798424809254247, + "grad_norm": 0.9499627351760864, + "learning_rate": 2.1552891878872587e-06, + "loss": 0.6168, + "step": 8885 + }, + { + "epoch": 3.2802116662564607, + "grad_norm": 0.9541431665420532, + "learning_rate": 2.1548074652172872e-06, + "loss": 0.6612, + "step": 8886 + }, + { + "epoch": 3.280580851587497, + "grad_norm": 0.9592177867889404, + "learning_rate": 2.1543257556127447e-06, + "loss": 0.596, + "step": 8887 + }, + { + "epoch": 3.280950036918533, + "grad_norm": 0.9115509390830994, + "learning_rate": 2.153844059091864e-06, + "loss": 0.5889, + "step": 8888 + }, + { + "epoch": 3.2813192222495693, + "grad_norm": 0.94068843126297, + "learning_rate": 2.1533623756728753e-06, + "loss": 0.5846, + "step": 8889 + }, + { + "epoch": 3.2816884075806056, + "grad_norm": 0.9308181405067444, + "learning_rate": 2.1528807053740124e-06, + "loss": 0.6018, + "step": 8890 + }, + { + "epoch": 3.2820575929116416, + "grad_norm": 0.9307516813278198, + "learning_rate": 2.1523990482135043e-06, + "loss": 0.5977, + "step": 8891 + }, + { + "epoch": 3.282426778242678, + "grad_norm": 0.9140745997428894, + "learning_rate": 2.1519174042095823e-06, + "loss": 0.5922, + "step": 8892 + }, + { + "epoch": 3.282795963573714, + "grad_norm": 0.908577024936676, + "learning_rate": 2.1514357733804782e-06, + "loss": 0.6059, + "step": 8893 + }, + { + "epoch": 3.28316514890475, + "grad_norm": 0.899254322052002, + "learning_rate": 2.1509541557444184e-06, + "loss": 0.6024, + "step": 8894 + }, + { + "epoch": 3.2835343342357866, + "grad_norm": 0.9217035174369812, + "learning_rate": 2.150472551319635e-06, + "loss": 0.5765, + "step": 8895 + }, + { + "epoch": 3.2839035195668225, + "grad_norm": 0.9237058758735657, + "learning_rate": 2.149990960124354e-06, + "loss": 0.5986, + "step": 8896 + }, + { + "epoch": 3.284272704897859, + "grad_norm": 0.9384385347366333, + "learning_rate": 2.1495093821768053e-06, + "loss": 0.5933, + "step": 8897 + }, + { + "epoch": 3.2846418902288947, + "grad_norm": 0.910790741443634, + "learning_rate": 2.1490278174952162e-06, + "loss": 0.5667, + "step": 8898 + }, + { + "epoch": 3.285011075559931, + "grad_norm": 0.9024767279624939, + "learning_rate": 2.148546266097813e-06, + "loss": 0.5598, + "step": 8899 + }, + { + "epoch": 3.2853802608909675, + "grad_norm": 0.9391133785247803, + "learning_rate": 2.1480647280028225e-06, + "loss": 0.5865, + "step": 8900 + }, + { + "epoch": 3.2857494462220034, + "grad_norm": 0.9015421867370605, + "learning_rate": 2.1475832032284716e-06, + "loss": 0.5739, + "step": 8901 + }, + { + "epoch": 3.2861186315530397, + "grad_norm": 0.914348304271698, + "learning_rate": 2.1471016917929842e-06, + "loss": 0.6122, + "step": 8902 + }, + { + "epoch": 3.2864878168840757, + "grad_norm": 0.9062631130218506, + "learning_rate": 2.146620193714588e-06, + "loss": 0.5627, + "step": 8903 + }, + { + "epoch": 3.286857002215112, + "grad_norm": 0.9251095056533813, + "learning_rate": 2.146138709011504e-06, + "loss": 0.6411, + "step": 8904 + }, + { + "epoch": 3.287226187546148, + "grad_norm": 0.8936845064163208, + "learning_rate": 2.1456572377019598e-06, + "loss": 0.5708, + "step": 8905 + }, + { + "epoch": 3.2875953728771843, + "grad_norm": 0.9470497369766235, + "learning_rate": 2.1451757798041766e-06, + "loss": 0.6021, + "step": 8906 + }, + { + "epoch": 3.2879645582082206, + "grad_norm": 0.9197751879692078, + "learning_rate": 2.1446943353363782e-06, + "loss": 0.579, + "step": 8907 + }, + { + "epoch": 3.2883337435392566, + "grad_norm": 0.9223349690437317, + "learning_rate": 2.1442129043167877e-06, + "loss": 0.5722, + "step": 8908 + }, + { + "epoch": 3.288702928870293, + "grad_norm": 0.9182333946228027, + "learning_rate": 2.1437314867636256e-06, + "loss": 0.5962, + "step": 8909 + }, + { + "epoch": 3.2890721142013293, + "grad_norm": 0.9287170171737671, + "learning_rate": 2.143250082695115e-06, + "loss": 0.5937, + "step": 8910 + }, + { + "epoch": 3.289441299532365, + "grad_norm": 0.9310613870620728, + "learning_rate": 2.1427686921294757e-06, + "loss": 0.6061, + "step": 8911 + }, + { + "epoch": 3.2898104848634016, + "grad_norm": 0.8786295652389526, + "learning_rate": 2.1422873150849282e-06, + "loss": 0.5887, + "step": 8912 + }, + { + "epoch": 3.2901796701944375, + "grad_norm": 0.9028212428092957, + "learning_rate": 2.141805951579694e-06, + "loss": 0.5377, + "step": 8913 + }, + { + "epoch": 3.290548855525474, + "grad_norm": 0.920509934425354, + "learning_rate": 2.1413246016319907e-06, + "loss": 0.5798, + "step": 8914 + }, + { + "epoch": 3.2909180408565097, + "grad_norm": 0.921145498752594, + "learning_rate": 2.140843265260038e-06, + "loss": 0.6046, + "step": 8915 + }, + { + "epoch": 3.291287226187546, + "grad_norm": 0.9909632205963135, + "learning_rate": 2.140361942482055e-06, + "loss": 0.5983, + "step": 8916 + }, + { + "epoch": 3.2916564115185825, + "grad_norm": 0.8954960703849792, + "learning_rate": 2.139880633316259e-06, + "loss": 0.5715, + "step": 8917 + }, + { + "epoch": 3.2920255968496184, + "grad_norm": 0.97716224193573, + "learning_rate": 2.139399337780867e-06, + "loss": 0.6055, + "step": 8918 + }, + { + "epoch": 3.2923947821806547, + "grad_norm": 0.8823646903038025, + "learning_rate": 2.138918055894096e-06, + "loss": 0.5949, + "step": 8919 + }, + { + "epoch": 3.292763967511691, + "grad_norm": 0.9171161651611328, + "learning_rate": 2.138436787674163e-06, + "loss": 0.616, + "step": 8920 + }, + { + "epoch": 3.293133152842727, + "grad_norm": 0.9085773229598999, + "learning_rate": 2.1379555331392836e-06, + "loss": 0.5809, + "step": 8921 + }, + { + "epoch": 3.2935023381737634, + "grad_norm": 0.9292853474617004, + "learning_rate": 2.1374742923076722e-06, + "loss": 0.6279, + "step": 8922 + }, + { + "epoch": 3.2938715235047993, + "grad_norm": 0.9044289588928223, + "learning_rate": 2.1369930651975456e-06, + "loss": 0.5736, + "step": 8923 + }, + { + "epoch": 3.2942407088358356, + "grad_norm": 0.9582463502883911, + "learning_rate": 2.136511851827116e-06, + "loss": 0.5988, + "step": 8924 + }, + { + "epoch": 3.2946098941668716, + "grad_norm": 0.9235432147979736, + "learning_rate": 2.136030652214598e-06, + "loss": 0.5885, + "step": 8925 + }, + { + "epoch": 3.294979079497908, + "grad_norm": 0.9084847569465637, + "learning_rate": 2.135549466378206e-06, + "loss": 0.5729, + "step": 8926 + }, + { + "epoch": 3.2953482648289443, + "grad_norm": 0.9409551620483398, + "learning_rate": 2.1350682943361506e-06, + "loss": 0.5996, + "step": 8927 + }, + { + "epoch": 3.29571745015998, + "grad_norm": 0.9265755414962769, + "learning_rate": 2.134587136106646e-06, + "loss": 0.5905, + "step": 8928 + }, + { + "epoch": 3.2960866354910165, + "grad_norm": 0.9086862206459045, + "learning_rate": 2.134105991707902e-06, + "loss": 0.5628, + "step": 8929 + }, + { + "epoch": 3.296455820822053, + "grad_norm": 0.9735234379768372, + "learning_rate": 2.133624861158131e-06, + "loss": 0.6087, + "step": 8930 + }, + { + "epoch": 3.296825006153089, + "grad_norm": 0.9336267709732056, + "learning_rate": 2.1331437444755444e-06, + "loss": 0.6185, + "step": 8931 + }, + { + "epoch": 3.297194191484125, + "grad_norm": 0.9076899290084839, + "learning_rate": 2.1326626416783496e-06, + "loss": 0.5972, + "step": 8932 + }, + { + "epoch": 3.297563376815161, + "grad_norm": 0.9083214998245239, + "learning_rate": 2.1321815527847596e-06, + "loss": 0.5382, + "step": 8933 + }, + { + "epoch": 3.2979325621461975, + "grad_norm": 0.9529834985733032, + "learning_rate": 2.1317004778129803e-06, + "loss": 0.6054, + "step": 8934 + }, + { + "epoch": 3.2983017474772334, + "grad_norm": 0.9303247332572937, + "learning_rate": 2.131219416781222e-06, + "loss": 0.6321, + "step": 8935 + }, + { + "epoch": 3.2986709328082697, + "grad_norm": 0.9113239645957947, + "learning_rate": 2.1307383697076934e-06, + "loss": 0.6003, + "step": 8936 + }, + { + "epoch": 3.299040118139306, + "grad_norm": 0.9327229261398315, + "learning_rate": 2.1302573366105998e-06, + "loss": 0.592, + "step": 8937 + }, + { + "epoch": 3.299409303470342, + "grad_norm": 0.901864767074585, + "learning_rate": 2.1297763175081498e-06, + "loss": 0.5556, + "step": 8938 + }, + { + "epoch": 3.2997784888013784, + "grad_norm": 0.9225730895996094, + "learning_rate": 2.1292953124185495e-06, + "loss": 0.5763, + "step": 8939 + }, + { + "epoch": 3.3001476741324147, + "grad_norm": 0.9041929841041565, + "learning_rate": 2.1288143213600043e-06, + "loss": 0.5561, + "step": 8940 + }, + { + "epoch": 3.3005168594634506, + "grad_norm": 0.9453275799751282, + "learning_rate": 2.12833334435072e-06, + "loss": 0.6173, + "step": 8941 + }, + { + "epoch": 3.300886044794487, + "grad_norm": 0.874563455581665, + "learning_rate": 2.1278523814089013e-06, + "loss": 0.5548, + "step": 8942 + }, + { + "epoch": 3.301255230125523, + "grad_norm": 0.9226747751235962, + "learning_rate": 2.1273714325527524e-06, + "loss": 0.5841, + "step": 8943 + }, + { + "epoch": 3.3016244154565593, + "grad_norm": 0.9151285290718079, + "learning_rate": 2.126890497800477e-06, + "loss": 0.6006, + "step": 8944 + }, + { + "epoch": 3.301993600787595, + "grad_norm": 0.8903526663780212, + "learning_rate": 2.126409577170279e-06, + "loss": 0.594, + "step": 8945 + }, + { + "epoch": 3.3023627861186315, + "grad_norm": 0.926118016242981, + "learning_rate": 2.125928670680361e-06, + "loss": 0.5697, + "step": 8946 + }, + { + "epoch": 3.302731971449668, + "grad_norm": 0.9266758561134338, + "learning_rate": 2.1254477783489237e-06, + "loss": 0.5871, + "step": 8947 + }, + { + "epoch": 3.303101156780704, + "grad_norm": 0.9303613305091858, + "learning_rate": 2.1249669001941702e-06, + "loss": 0.584, + "step": 8948 + }, + { + "epoch": 3.30347034211174, + "grad_norm": 0.9253128170967102, + "learning_rate": 2.124486036234302e-06, + "loss": 0.5942, + "step": 8949 + }, + { + "epoch": 3.303839527442776, + "grad_norm": 0.9048087000846863, + "learning_rate": 2.1240051864875173e-06, + "loss": 0.5781, + "step": 8950 + }, + { + "epoch": 3.3042087127738125, + "grad_norm": 0.9561468362808228, + "learning_rate": 2.1235243509720187e-06, + "loss": 0.6249, + "step": 8951 + }, + { + "epoch": 3.304577898104849, + "grad_norm": 0.9269001483917236, + "learning_rate": 2.1230435297060037e-06, + "loss": 0.5902, + "step": 8952 + }, + { + "epoch": 3.3049470834358847, + "grad_norm": 0.9026976227760315, + "learning_rate": 2.1225627227076726e-06, + "loss": 0.58, + "step": 8953 + }, + { + "epoch": 3.305316268766921, + "grad_norm": 0.9562625885009766, + "learning_rate": 2.122081929995224e-06, + "loss": 0.5857, + "step": 8954 + }, + { + "epoch": 3.305685454097957, + "grad_norm": 0.9410803318023682, + "learning_rate": 2.121601151586854e-06, + "loss": 0.6156, + "step": 8955 + }, + { + "epoch": 3.3060546394289934, + "grad_norm": 0.9219542145729065, + "learning_rate": 2.121120387500762e-06, + "loss": 0.6085, + "step": 8956 + }, + { + "epoch": 3.3064238247600297, + "grad_norm": 0.8787244558334351, + "learning_rate": 2.120639637755143e-06, + "loss": 0.5251, + "step": 8957 + }, + { + "epoch": 3.3067930100910656, + "grad_norm": 0.9032600522041321, + "learning_rate": 2.120158902368194e-06, + "loss": 0.5942, + "step": 8958 + }, + { + "epoch": 3.307162195422102, + "grad_norm": 0.8999313116073608, + "learning_rate": 2.1196781813581113e-06, + "loss": 0.5816, + "step": 8959 + }, + { + "epoch": 3.307531380753138, + "grad_norm": 0.9222333431243896, + "learning_rate": 2.119197474743089e-06, + "loss": 0.6145, + "step": 8960 + }, + { + "epoch": 3.3079005660841743, + "grad_norm": 0.9175475239753723, + "learning_rate": 2.118716782541322e-06, + "loss": 0.5976, + "step": 8961 + }, + { + "epoch": 3.3082697514152106, + "grad_norm": 0.9351820945739746, + "learning_rate": 2.118236104771004e-06, + "loss": 0.5827, + "step": 8962 + }, + { + "epoch": 3.3086389367462465, + "grad_norm": 0.9346526265144348, + "learning_rate": 2.117755441450329e-06, + "loss": 0.6392, + "step": 8963 + }, + { + "epoch": 3.309008122077283, + "grad_norm": 0.9122166037559509, + "learning_rate": 2.117274792597491e-06, + "loss": 0.6063, + "step": 8964 + }, + { + "epoch": 3.309377307408319, + "grad_norm": 0.9220876693725586, + "learning_rate": 2.1167941582306805e-06, + "loss": 0.601, + "step": 8965 + }, + { + "epoch": 3.309746492739355, + "grad_norm": 0.9445088505744934, + "learning_rate": 2.1163135383680906e-06, + "loss": 0.6041, + "step": 8966 + }, + { + "epoch": 3.3101156780703915, + "grad_norm": 0.9007879495620728, + "learning_rate": 2.115832933027912e-06, + "loss": 0.5583, + "step": 8967 + }, + { + "epoch": 3.3104848634014274, + "grad_norm": 0.9119090437889099, + "learning_rate": 2.1153523422283353e-06, + "loss": 0.5715, + "step": 8968 + }, + { + "epoch": 3.310854048732464, + "grad_norm": 0.9007024765014648, + "learning_rate": 2.1148717659875517e-06, + "loss": 0.5591, + "step": 8969 + }, + { + "epoch": 3.3112232340634997, + "grad_norm": 0.9209088087081909, + "learning_rate": 2.11439120432375e-06, + "loss": 0.605, + "step": 8970 + }, + { + "epoch": 3.311592419394536, + "grad_norm": 0.9339796900749207, + "learning_rate": 2.1139106572551185e-06, + "loss": 0.5887, + "step": 8971 + }, + { + "epoch": 3.311961604725572, + "grad_norm": 0.9369813799858093, + "learning_rate": 2.1134301247998477e-06, + "loss": 0.6098, + "step": 8972 + }, + { + "epoch": 3.3123307900566084, + "grad_norm": 0.9146403670310974, + "learning_rate": 2.1129496069761245e-06, + "loss": 0.5644, + "step": 8973 + }, + { + "epoch": 3.3126999753876447, + "grad_norm": 0.9721027612686157, + "learning_rate": 2.112469103802137e-06, + "loss": 0.6001, + "step": 8974 + }, + { + "epoch": 3.3130691607186806, + "grad_norm": 0.9184406399726868, + "learning_rate": 2.1119886152960712e-06, + "loss": 0.5978, + "step": 8975 + }, + { + "epoch": 3.313438346049717, + "grad_norm": 0.9004332423210144, + "learning_rate": 2.1115081414761136e-06, + "loss": 0.5546, + "step": 8976 + }, + { + "epoch": 3.3138075313807533, + "grad_norm": 0.9188088178634644, + "learning_rate": 2.111027682360451e-06, + "loss": 0.6017, + "step": 8977 + }, + { + "epoch": 3.3141767167117893, + "grad_norm": 0.9534095525741577, + "learning_rate": 2.1105472379672673e-06, + "loss": 0.6007, + "step": 8978 + }, + { + "epoch": 3.3145459020428256, + "grad_norm": 0.8853995203971863, + "learning_rate": 2.1100668083147483e-06, + "loss": 0.5669, + "step": 8979 + }, + { + "epoch": 3.3149150873738615, + "grad_norm": 0.9111990928649902, + "learning_rate": 2.1095863934210768e-06, + "loss": 0.597, + "step": 8980 + }, + { + "epoch": 3.315284272704898, + "grad_norm": 0.9641180038452148, + "learning_rate": 2.1091059933044367e-06, + "loss": 0.601, + "step": 8981 + }, + { + "epoch": 3.315653458035934, + "grad_norm": 0.9247210621833801, + "learning_rate": 2.108625607983012e-06, + "loss": 0.6123, + "step": 8982 + }, + { + "epoch": 3.31602264336697, + "grad_norm": 0.9159184694290161, + "learning_rate": 2.1081452374749846e-06, + "loss": 0.5707, + "step": 8983 + }, + { + "epoch": 3.3163918286980065, + "grad_norm": 0.9120333790779114, + "learning_rate": 2.1076648817985366e-06, + "loss": 0.5728, + "step": 8984 + }, + { + "epoch": 3.3167610140290424, + "grad_norm": 0.9419568181037903, + "learning_rate": 2.1071845409718485e-06, + "loss": 0.5863, + "step": 8985 + }, + { + "epoch": 3.317130199360079, + "grad_norm": 0.9299066662788391, + "learning_rate": 2.1067042150131013e-06, + "loss": 0.5976, + "step": 8986 + }, + { + "epoch": 3.317499384691115, + "grad_norm": 0.9283381104469299, + "learning_rate": 2.106223903940476e-06, + "loss": 0.6127, + "step": 8987 + }, + { + "epoch": 3.317868570022151, + "grad_norm": 0.892882227897644, + "learning_rate": 2.1057436077721518e-06, + "loss": 0.5754, + "step": 8988 + }, + { + "epoch": 3.3182377553531874, + "grad_norm": 0.9357994198799133, + "learning_rate": 2.1052633265263075e-06, + "loss": 0.6043, + "step": 8989 + }, + { + "epoch": 3.3186069406842233, + "grad_norm": 0.918321967124939, + "learning_rate": 2.1047830602211215e-06, + "loss": 0.5842, + "step": 8990 + }, + { + "epoch": 3.3189761260152597, + "grad_norm": 0.9550317525863647, + "learning_rate": 2.1043028088747715e-06, + "loss": 0.6133, + "step": 8991 + }, + { + "epoch": 3.3193453113462956, + "grad_norm": 0.9344510436058044, + "learning_rate": 2.103822572505436e-06, + "loss": 0.5797, + "step": 8992 + }, + { + "epoch": 3.319714496677332, + "grad_norm": 0.9068375825881958, + "learning_rate": 2.1033423511312908e-06, + "loss": 0.561, + "step": 8993 + }, + { + "epoch": 3.3200836820083683, + "grad_norm": 0.9455716013908386, + "learning_rate": 2.1028621447705125e-06, + "loss": 0.5938, + "step": 8994 + }, + { + "epoch": 3.3204528673394043, + "grad_norm": 0.9435574412345886, + "learning_rate": 2.1023819534412767e-06, + "loss": 0.5767, + "step": 8995 + }, + { + "epoch": 3.3208220526704406, + "grad_norm": 0.95679771900177, + "learning_rate": 2.101901777161758e-06, + "loss": 0.5853, + "step": 8996 + }, + { + "epoch": 3.321191238001477, + "grad_norm": 0.9250844717025757, + "learning_rate": 2.101421615950132e-06, + "loss": 0.5877, + "step": 8997 + }, + { + "epoch": 3.321560423332513, + "grad_norm": 0.9162894487380981, + "learning_rate": 2.100941469824572e-06, + "loss": 0.5648, + "step": 8998 + }, + { + "epoch": 3.3219296086635492, + "grad_norm": 0.9507448673248291, + "learning_rate": 2.1004613388032517e-06, + "loss": 0.6068, + "step": 8999 + }, + { + "epoch": 3.322298793994585, + "grad_norm": 0.9005255699157715, + "learning_rate": 2.0999812229043433e-06, + "loss": 0.5859, + "step": 9000 + }, + { + "epoch": 3.3226679793256215, + "grad_norm": 0.9397231340408325, + "learning_rate": 2.0995011221460183e-06, + "loss": 0.5646, + "step": 9001 + }, + { + "epoch": 3.3230371646566574, + "grad_norm": 0.9542262554168701, + "learning_rate": 2.099021036546451e-06, + "loss": 0.5889, + "step": 9002 + }, + { + "epoch": 3.323406349987694, + "grad_norm": 0.9221029281616211, + "learning_rate": 2.09854096612381e-06, + "loss": 0.5676, + "step": 9003 + }, + { + "epoch": 3.32377553531873, + "grad_norm": 0.9345236420631409, + "learning_rate": 2.0980609108962673e-06, + "loss": 0.5864, + "step": 9004 + }, + { + "epoch": 3.324144720649766, + "grad_norm": 0.9503288865089417, + "learning_rate": 2.0975808708819916e-06, + "loss": 0.6186, + "step": 9005 + }, + { + "epoch": 3.3245139059808024, + "grad_norm": 0.9232698678970337, + "learning_rate": 2.0971008460991534e-06, + "loss": 0.6017, + "step": 9006 + }, + { + "epoch": 3.324883091311839, + "grad_norm": 0.9547373056411743, + "learning_rate": 2.0966208365659215e-06, + "loss": 0.6007, + "step": 9007 + }, + { + "epoch": 3.3252522766428747, + "grad_norm": 0.9309756755828857, + "learning_rate": 2.0961408423004628e-06, + "loss": 0.5987, + "step": 9008 + }, + { + "epoch": 3.325621461973911, + "grad_norm": 0.8778833746910095, + "learning_rate": 2.095660863320945e-06, + "loss": 0.5657, + "step": 9009 + }, + { + "epoch": 3.325990647304947, + "grad_norm": 0.90522700548172, + "learning_rate": 2.095180899645538e-06, + "loss": 0.6157, + "step": 9010 + }, + { + "epoch": 3.3263598326359833, + "grad_norm": 0.9299862384796143, + "learning_rate": 2.0947009512924044e-06, + "loss": 0.6137, + "step": 9011 + }, + { + "epoch": 3.3267290179670193, + "grad_norm": 0.9795272350311279, + "learning_rate": 2.094221018279713e-06, + "loss": 0.6018, + "step": 9012 + }, + { + "epoch": 3.3270982032980556, + "grad_norm": 0.8987348675727844, + "learning_rate": 2.0937411006256274e-06, + "loss": 0.6254, + "step": 9013 + }, + { + "epoch": 3.327467388629092, + "grad_norm": 1.0266636610031128, + "learning_rate": 2.093261198348313e-06, + "loss": 0.5818, + "step": 9014 + }, + { + "epoch": 3.327836573960128, + "grad_norm": 0.8902499079704285, + "learning_rate": 2.0927813114659344e-06, + "loss": 0.59, + "step": 9015 + }, + { + "epoch": 3.3282057592911642, + "grad_norm": 0.9331735968589783, + "learning_rate": 2.0923014399966542e-06, + "loss": 0.5916, + "step": 9016 + }, + { + "epoch": 3.3285749446222, + "grad_norm": 0.933070182800293, + "learning_rate": 2.0918215839586364e-06, + "loss": 0.599, + "step": 9017 + }, + { + "epoch": 3.3289441299532365, + "grad_norm": 0.9264621138572693, + "learning_rate": 2.091341743370042e-06, + "loss": 0.5933, + "step": 9018 + }, + { + "epoch": 3.329313315284273, + "grad_norm": 0.9275625348091125, + "learning_rate": 2.0908619182490335e-06, + "loss": 0.6201, + "step": 9019 + }, + { + "epoch": 3.329682500615309, + "grad_norm": 0.9636704325675964, + "learning_rate": 2.090382108613773e-06, + "loss": 0.587, + "step": 9020 + }, + { + "epoch": 3.330051685946345, + "grad_norm": 0.9071925282478333, + "learning_rate": 2.08990231448242e-06, + "loss": 0.5915, + "step": 9021 + }, + { + "epoch": 3.330420871277381, + "grad_norm": 0.9675769209861755, + "learning_rate": 2.0894225358731355e-06, + "loss": 0.5797, + "step": 9022 + }, + { + "epoch": 3.3307900566084174, + "grad_norm": 0.9348405599594116, + "learning_rate": 2.088942772804078e-06, + "loss": 0.584, + "step": 9023 + }, + { + "epoch": 3.331159241939454, + "grad_norm": 0.9345868229866028, + "learning_rate": 2.0884630252934067e-06, + "loss": 0.6114, + "step": 9024 + }, + { + "epoch": 3.3315284272704897, + "grad_norm": 0.9212341904640198, + "learning_rate": 2.0879832933592805e-06, + "loss": 0.5715, + "step": 9025 + }, + { + "epoch": 3.331897612601526, + "grad_norm": 0.9298580884933472, + "learning_rate": 2.087503577019856e-06, + "loss": 0.5819, + "step": 9026 + }, + { + "epoch": 3.332266797932562, + "grad_norm": 0.937308132648468, + "learning_rate": 2.0870238762932914e-06, + "loss": 0.5849, + "step": 9027 + }, + { + "epoch": 3.3326359832635983, + "grad_norm": 0.9240490794181824, + "learning_rate": 2.0865441911977425e-06, + "loss": 0.6033, + "step": 9028 + }, + { + "epoch": 3.3330051685946347, + "grad_norm": 0.9257277250289917, + "learning_rate": 2.0860645217513646e-06, + "loss": 0.5842, + "step": 9029 + }, + { + "epoch": 3.3333743539256706, + "grad_norm": 0.9520443081855774, + "learning_rate": 2.0855848679723153e-06, + "loss": 0.6204, + "step": 9030 + }, + { + "epoch": 3.333743539256707, + "grad_norm": 0.9077940583229065, + "learning_rate": 2.085105229878747e-06, + "loss": 0.5824, + "step": 9031 + }, + { + "epoch": 3.334112724587743, + "grad_norm": 0.9350326061248779, + "learning_rate": 2.084625607488816e-06, + "loss": 0.6053, + "step": 9032 + }, + { + "epoch": 3.3344819099187792, + "grad_norm": 0.9242446422576904, + "learning_rate": 2.084146000820674e-06, + "loss": 0.5929, + "step": 9033 + }, + { + "epoch": 3.3348510952498156, + "grad_norm": 0.9053428769111633, + "learning_rate": 2.0836664098924742e-06, + "loss": 0.5813, + "step": 9034 + }, + { + "epoch": 3.3352202805808515, + "grad_norm": 0.9272119998931885, + "learning_rate": 2.08318683472237e-06, + "loss": 0.6263, + "step": 9035 + }, + { + "epoch": 3.335589465911888, + "grad_norm": 0.9262641072273254, + "learning_rate": 2.082707275328512e-06, + "loss": 0.5898, + "step": 9036 + }, + { + "epoch": 3.335958651242924, + "grad_norm": 0.9271035194396973, + "learning_rate": 2.0822277317290528e-06, + "loss": 0.6008, + "step": 9037 + }, + { + "epoch": 3.33632783657396, + "grad_norm": 0.9110671877861023, + "learning_rate": 2.0817482039421416e-06, + "loss": 0.5811, + "step": 9038 + }, + { + "epoch": 3.336697021904996, + "grad_norm": 0.9297341704368591, + "learning_rate": 2.081268691985928e-06, + "loss": 0.5638, + "step": 9039 + }, + { + "epoch": 3.3370662072360324, + "grad_norm": 0.9385644793510437, + "learning_rate": 2.0807891958785637e-06, + "loss": 0.6414, + "step": 9040 + }, + { + "epoch": 3.337435392567069, + "grad_norm": 0.9138168096542358, + "learning_rate": 2.0803097156381953e-06, + "loss": 0.5665, + "step": 9041 + }, + { + "epoch": 3.3378045778981047, + "grad_norm": 0.9577962160110474, + "learning_rate": 2.0798302512829722e-06, + "loss": 0.6084, + "step": 9042 + }, + { + "epoch": 3.338173763229141, + "grad_norm": 0.9088366627693176, + "learning_rate": 2.079350802831041e-06, + "loss": 0.5863, + "step": 9043 + }, + { + "epoch": 3.3385429485601774, + "grad_norm": 0.9366892576217651, + "learning_rate": 2.0788713703005487e-06, + "loss": 0.5935, + "step": 9044 + }, + { + "epoch": 3.3389121338912133, + "grad_norm": 0.9227306842803955, + "learning_rate": 2.078391953709643e-06, + "loss": 0.5749, + "step": 9045 + }, + { + "epoch": 3.3392813192222497, + "grad_norm": 0.9372220635414124, + "learning_rate": 2.0779125530764686e-06, + "loss": 0.5724, + "step": 9046 + }, + { + "epoch": 3.3396505045532856, + "grad_norm": 0.8894624710083008, + "learning_rate": 2.0774331684191698e-06, + "loss": 0.5811, + "step": 9047 + }, + { + "epoch": 3.340019689884322, + "grad_norm": 0.941307544708252, + "learning_rate": 2.0769537997558943e-06, + "loss": 0.5899, + "step": 9048 + }, + { + "epoch": 3.340388875215358, + "grad_norm": 0.908374011516571, + "learning_rate": 2.076474447104782e-06, + "loss": 0.5454, + "step": 9049 + }, + { + "epoch": 3.3407580605463942, + "grad_norm": 0.9134104251861572, + "learning_rate": 2.0759951104839793e-06, + "loss": 0.5969, + "step": 9050 + }, + { + "epoch": 3.3411272458774306, + "grad_norm": 0.9503081440925598, + "learning_rate": 2.0755157899116275e-06, + "loss": 0.5997, + "step": 9051 + }, + { + "epoch": 3.3414964312084665, + "grad_norm": 0.8730022311210632, + "learning_rate": 2.0750364854058687e-06, + "loss": 0.5472, + "step": 9052 + }, + { + "epoch": 3.341865616539503, + "grad_norm": 0.9379123449325562, + "learning_rate": 2.074557196984846e-06, + "loss": 0.5991, + "step": 9053 + }, + { + "epoch": 3.3422348018705392, + "grad_norm": 0.9106814861297607, + "learning_rate": 2.074077924666698e-06, + "loss": 0.5727, + "step": 9054 + }, + { + "epoch": 3.342603987201575, + "grad_norm": 0.9206032156944275, + "learning_rate": 2.073598668469567e-06, + "loss": 0.5833, + "step": 9055 + }, + { + "epoch": 3.3429731725326115, + "grad_norm": 0.9116297960281372, + "learning_rate": 2.073119428411591e-06, + "loss": 0.5892, + "step": 9056 + }, + { + "epoch": 3.3433423578636474, + "grad_norm": 0.933799684047699, + "learning_rate": 2.0726402045109095e-06, + "loss": 0.5883, + "step": 9057 + }, + { + "epoch": 3.3437115431946838, + "grad_norm": 0.8927928805351257, + "learning_rate": 2.072160996785663e-06, + "loss": 0.5603, + "step": 9058 + }, + { + "epoch": 3.3440807285257197, + "grad_norm": 0.9320164918899536, + "learning_rate": 2.071681805253986e-06, + "loss": 0.5919, + "step": 9059 + }, + { + "epoch": 3.344449913856756, + "grad_norm": 0.9217719435691833, + "learning_rate": 2.071202629934019e-06, + "loss": 0.6026, + "step": 9060 + }, + { + "epoch": 3.3448190991877924, + "grad_norm": 0.8844900727272034, + "learning_rate": 2.0707234708438965e-06, + "loss": 0.5291, + "step": 9061 + }, + { + "epoch": 3.3451882845188283, + "grad_norm": 0.9420340061187744, + "learning_rate": 2.0702443280017546e-06, + "loss": 0.6105, + "step": 9062 + }, + { + "epoch": 3.3455574698498647, + "grad_norm": 0.9038479328155518, + "learning_rate": 2.06976520142573e-06, + "loss": 0.614, + "step": 9063 + }, + { + "epoch": 3.345926655180901, + "grad_norm": 0.9125717878341675, + "learning_rate": 2.069286091133957e-06, + "loss": 0.5946, + "step": 9064 + }, + { + "epoch": 3.346295840511937, + "grad_norm": 0.9324728846549988, + "learning_rate": 2.068806997144569e-06, + "loss": 0.5702, + "step": 9065 + }, + { + "epoch": 3.3466650258429733, + "grad_norm": 0.9453551173210144, + "learning_rate": 2.0683279194757e-06, + "loss": 0.6078, + "step": 9066 + }, + { + "epoch": 3.3470342111740092, + "grad_norm": 0.9516147971153259, + "learning_rate": 2.0678488581454825e-06, + "loss": 0.6563, + "step": 9067 + }, + { + "epoch": 3.3474033965050456, + "grad_norm": 0.9289828538894653, + "learning_rate": 2.0673698131720506e-06, + "loss": 0.5828, + "step": 9068 + }, + { + "epoch": 3.3477725818360815, + "grad_norm": 0.9254622459411621, + "learning_rate": 2.0668907845735337e-06, + "loss": 0.5687, + "step": 9069 + }, + { + "epoch": 3.348141767167118, + "grad_norm": 0.9279677271842957, + "learning_rate": 2.0664117723680648e-06, + "loss": 0.6043, + "step": 9070 + }, + { + "epoch": 3.3485109524981542, + "grad_norm": 0.9272057414054871, + "learning_rate": 2.0659327765737724e-06, + "loss": 0.5779, + "step": 9071 + }, + { + "epoch": 3.34888013782919, + "grad_norm": 0.913517951965332, + "learning_rate": 2.065453797208788e-06, + "loss": 0.5702, + "step": 9072 + }, + { + "epoch": 3.3492493231602265, + "grad_norm": 0.9534525871276855, + "learning_rate": 2.0649748342912405e-06, + "loss": 0.5925, + "step": 9073 + }, + { + "epoch": 3.349618508491263, + "grad_norm": 0.9332245588302612, + "learning_rate": 2.064495887839258e-06, + "loss": 0.5951, + "step": 9074 + }, + { + "epoch": 3.3499876938222988, + "grad_norm": 0.9480438828468323, + "learning_rate": 2.064016957870969e-06, + "loss": 0.6008, + "step": 9075 + }, + { + "epoch": 3.350356879153335, + "grad_norm": 0.9239978194236755, + "learning_rate": 2.0635380444045e-06, + "loss": 0.5722, + "step": 9076 + }, + { + "epoch": 3.350726064484371, + "grad_norm": 0.8916158080101013, + "learning_rate": 2.0630591474579775e-06, + "loss": 0.5832, + "step": 9077 + }, + { + "epoch": 3.3510952498154074, + "grad_norm": 0.9455827474594116, + "learning_rate": 2.0625802670495303e-06, + "loss": 0.6011, + "step": 9078 + }, + { + "epoch": 3.3514644351464433, + "grad_norm": 0.9070374965667725, + "learning_rate": 2.06210140319728e-06, + "loss": 0.5691, + "step": 9079 + }, + { + "epoch": 3.3518336204774797, + "grad_norm": 0.9512082934379578, + "learning_rate": 2.0616225559193552e-06, + "loss": 0.6197, + "step": 9080 + }, + { + "epoch": 3.352202805808516, + "grad_norm": 0.9258506894111633, + "learning_rate": 2.061143725233877e-06, + "loss": 0.6117, + "step": 9081 + }, + { + "epoch": 3.352571991139552, + "grad_norm": 0.8929771184921265, + "learning_rate": 2.0606649111589706e-06, + "loss": 0.5509, + "step": 9082 + }, + { + "epoch": 3.3529411764705883, + "grad_norm": 0.9448169469833374, + "learning_rate": 2.0601861137127593e-06, + "loss": 0.5979, + "step": 9083 + }, + { + "epoch": 3.3533103618016242, + "grad_norm": 0.9282087087631226, + "learning_rate": 2.059707332913364e-06, + "loss": 0.6127, + "step": 9084 + }, + { + "epoch": 3.3536795471326606, + "grad_norm": 0.9188057780265808, + "learning_rate": 2.059228568778907e-06, + "loss": 0.5752, + "step": 9085 + }, + { + "epoch": 3.354048732463697, + "grad_norm": 0.96979820728302, + "learning_rate": 2.058749821327511e-06, + "loss": 0.6145, + "step": 9086 + }, + { + "epoch": 3.354417917794733, + "grad_norm": 1.0185630321502686, + "learning_rate": 2.058271090577293e-06, + "loss": 0.582, + "step": 9087 + }, + { + "epoch": 3.354787103125769, + "grad_norm": 0.8927154541015625, + "learning_rate": 2.057792376546377e-06, + "loss": 0.5507, + "step": 9088 + }, + { + "epoch": 3.355156288456805, + "grad_norm": 0.9096875786781311, + "learning_rate": 2.0573136792528782e-06, + "loss": 0.5385, + "step": 9089 + }, + { + "epoch": 3.3555254737878415, + "grad_norm": 0.9400303959846497, + "learning_rate": 2.056834998714917e-06, + "loss": 0.6201, + "step": 9090 + }, + { + "epoch": 3.355894659118878, + "grad_norm": 0.9479248523712158, + "learning_rate": 2.0563563349506125e-06, + "loss": 0.6121, + "step": 9091 + }, + { + "epoch": 3.3562638444499138, + "grad_norm": 0.9318063855171204, + "learning_rate": 2.0558776879780797e-06, + "loss": 0.5978, + "step": 9092 + }, + { + "epoch": 3.35663302978095, + "grad_norm": 0.9380689263343811, + "learning_rate": 2.055399057815437e-06, + "loss": 0.6036, + "step": 9093 + }, + { + "epoch": 3.357002215111986, + "grad_norm": 0.9113141894340515, + "learning_rate": 2.0549204444807993e-06, + "loss": 0.6026, + "step": 9094 + }, + { + "epoch": 3.3573714004430224, + "grad_norm": 0.928409218788147, + "learning_rate": 2.0544418479922814e-06, + "loss": 0.5996, + "step": 9095 + }, + { + "epoch": 3.3577405857740588, + "grad_norm": 0.9050112962722778, + "learning_rate": 2.0539632683680006e-06, + "loss": 0.5751, + "step": 9096 + }, + { + "epoch": 3.3581097711050947, + "grad_norm": 0.8890694975852966, + "learning_rate": 2.0534847056260677e-06, + "loss": 0.5946, + "step": 9097 + }, + { + "epoch": 3.358478956436131, + "grad_norm": 0.9008944630622864, + "learning_rate": 2.0530061597845995e-06, + "loss": 0.5676, + "step": 9098 + }, + { + "epoch": 3.358848141767167, + "grad_norm": 0.9099285006523132, + "learning_rate": 2.052527630861705e-06, + "loss": 0.5782, + "step": 9099 + }, + { + "epoch": 3.3592173270982033, + "grad_norm": 0.9474815726280212, + "learning_rate": 2.0520491188754997e-06, + "loss": 0.6138, + "step": 9100 + }, + { + "epoch": 3.3595865124292397, + "grad_norm": 0.9110016822814941, + "learning_rate": 2.051570623844094e-06, + "loss": 0.5542, + "step": 9101 + }, + { + "epoch": 3.3599556977602756, + "grad_norm": 0.9189659953117371, + "learning_rate": 2.051092145785598e-06, + "loss": 0.5566, + "step": 9102 + }, + { + "epoch": 3.360324883091312, + "grad_norm": 0.9396629929542542, + "learning_rate": 2.0506136847181234e-06, + "loss": 0.5828, + "step": 9103 + }, + { + "epoch": 3.360694068422348, + "grad_norm": 0.9165499806404114, + "learning_rate": 2.0501352406597785e-06, + "loss": 0.5933, + "step": 9104 + }, + { + "epoch": 3.361063253753384, + "grad_norm": 0.9009498953819275, + "learning_rate": 2.049656813628672e-06, + "loss": 0.5787, + "step": 9105 + }, + { + "epoch": 3.36143243908442, + "grad_norm": 0.9218095541000366, + "learning_rate": 2.049178403642915e-06, + "loss": 0.6254, + "step": 9106 + }, + { + "epoch": 3.3618016244154565, + "grad_norm": 0.8956566452980042, + "learning_rate": 2.0487000107206114e-06, + "loss": 0.5856, + "step": 9107 + }, + { + "epoch": 3.362170809746493, + "grad_norm": 0.930472731590271, + "learning_rate": 2.048221634879871e-06, + "loss": 0.6142, + "step": 9108 + }, + { + "epoch": 3.3625399950775288, + "grad_norm": 0.9176258444786072, + "learning_rate": 2.0477432761387984e-06, + "loss": 0.5704, + "step": 9109 + }, + { + "epoch": 3.362909180408565, + "grad_norm": 0.9120811820030212, + "learning_rate": 2.0472649345155e-06, + "loss": 0.5513, + "step": 9110 + }, + { + "epoch": 3.3632783657396015, + "grad_norm": 0.9249758124351501, + "learning_rate": 2.0467866100280816e-06, + "loss": 0.5736, + "step": 9111 + }, + { + "epoch": 3.3636475510706374, + "grad_norm": 0.9334111213684082, + "learning_rate": 2.0463083026946467e-06, + "loss": 0.5919, + "step": 9112 + }, + { + "epoch": 3.3640167364016738, + "grad_norm": 0.911925196647644, + "learning_rate": 2.0458300125332996e-06, + "loss": 0.577, + "step": 9113 + }, + { + "epoch": 3.3643859217327097, + "grad_norm": 0.9031065702438354, + "learning_rate": 2.0453517395621426e-06, + "loss": 0.554, + "step": 9114 + }, + { + "epoch": 3.364755107063746, + "grad_norm": 0.9439833164215088, + "learning_rate": 2.0448734837992782e-06, + "loss": 0.5916, + "step": 9115 + }, + { + "epoch": 3.365124292394782, + "grad_norm": 0.9359933733940125, + "learning_rate": 2.044395245262811e-06, + "loss": 0.5949, + "step": 9116 + }, + { + "epoch": 3.3654934777258183, + "grad_norm": 1.043207049369812, + "learning_rate": 2.043917023970838e-06, + "loss": 0.607, + "step": 9117 + }, + { + "epoch": 3.3658626630568547, + "grad_norm": 0.9279747605323792, + "learning_rate": 2.043438819941463e-06, + "loss": 0.5985, + "step": 9118 + }, + { + "epoch": 3.3662318483878906, + "grad_norm": 0.9302000999450684, + "learning_rate": 2.042960633192783e-06, + "loss": 0.5781, + "step": 9119 + }, + { + "epoch": 3.366601033718927, + "grad_norm": 0.9081958532333374, + "learning_rate": 2.0424824637428995e-06, + "loss": 0.5834, + "step": 9120 + }, + { + "epoch": 3.3669702190499633, + "grad_norm": 0.9028515219688416, + "learning_rate": 2.0420043116099113e-06, + "loss": 0.5609, + "step": 9121 + }, + { + "epoch": 3.367339404380999, + "grad_norm": 0.9304929375648499, + "learning_rate": 2.0415261768119145e-06, + "loss": 0.6126, + "step": 9122 + }, + { + "epoch": 3.3677085897120356, + "grad_norm": 0.8730399012565613, + "learning_rate": 2.0410480593670062e-06, + "loss": 0.5352, + "step": 9123 + }, + { + "epoch": 3.3680777750430715, + "grad_norm": 0.9320747256278992, + "learning_rate": 2.0405699592932864e-06, + "loss": 0.5648, + "step": 9124 + }, + { + "epoch": 3.368446960374108, + "grad_norm": 0.9078342914581299, + "learning_rate": 2.0400918766088464e-06, + "loss": 0.6222, + "step": 9125 + }, + { + "epoch": 3.3688161457051438, + "grad_norm": 0.9357078671455383, + "learning_rate": 2.0396138113317855e-06, + "loss": 0.6212, + "step": 9126 + }, + { + "epoch": 3.36918533103618, + "grad_norm": 0.9372440576553345, + "learning_rate": 2.039135763480195e-06, + "loss": 0.6022, + "step": 9127 + }, + { + "epoch": 3.3695545163672165, + "grad_norm": 0.9140127897262573, + "learning_rate": 2.038657733072171e-06, + "loss": 0.5669, + "step": 9128 + }, + { + "epoch": 3.3699237016982524, + "grad_norm": 0.9093740582466125, + "learning_rate": 2.0381797201258068e-06, + "loss": 0.5595, + "step": 9129 + }, + { + "epoch": 3.3702928870292888, + "grad_norm": 0.9260631799697876, + "learning_rate": 2.0377017246591935e-06, + "loss": 0.6103, + "step": 9130 + }, + { + "epoch": 3.370662072360325, + "grad_norm": 0.9356569051742554, + "learning_rate": 2.037223746690425e-06, + "loss": 0.5925, + "step": 9131 + }, + { + "epoch": 3.371031257691361, + "grad_norm": 0.9236599206924438, + "learning_rate": 2.0367457862375912e-06, + "loss": 0.5808, + "step": 9132 + }, + { + "epoch": 3.3714004430223974, + "grad_norm": 0.9488097429275513, + "learning_rate": 2.0362678433187824e-06, + "loss": 0.6031, + "step": 9133 + }, + { + "epoch": 3.3717696283534333, + "grad_norm": 0.9610624313354492, + "learning_rate": 2.035789917952091e-06, + "loss": 0.6115, + "step": 9134 + }, + { + "epoch": 3.3721388136844697, + "grad_norm": 0.9466395974159241, + "learning_rate": 2.035312010155603e-06, + "loss": 0.6085, + "step": 9135 + }, + { + "epoch": 3.3725079990155056, + "grad_norm": 0.9021192193031311, + "learning_rate": 2.0348341199474108e-06, + "loss": 0.5667, + "step": 9136 + }, + { + "epoch": 3.372877184346542, + "grad_norm": 0.9355321526527405, + "learning_rate": 2.034356247345598e-06, + "loss": 0.599, + "step": 9137 + }, + { + "epoch": 3.3732463696775783, + "grad_norm": 0.9487738609313965, + "learning_rate": 2.0338783923682553e-06, + "loss": 0.5968, + "step": 9138 + }, + { + "epoch": 3.373615555008614, + "grad_norm": 0.9675102233886719, + "learning_rate": 2.0334005550334687e-06, + "loss": 0.6283, + "step": 9139 + }, + { + "epoch": 3.3739847403396506, + "grad_norm": 0.9253572225570679, + "learning_rate": 2.0329227353593233e-06, + "loss": 0.6402, + "step": 9140 + }, + { + "epoch": 3.374353925670687, + "grad_norm": 0.9179958701133728, + "learning_rate": 2.0324449333639057e-06, + "loss": 0.5955, + "step": 9141 + }, + { + "epoch": 3.374723111001723, + "grad_norm": 0.9380093812942505, + "learning_rate": 2.0319671490652994e-06, + "loss": 0.5686, + "step": 9142 + }, + { + "epoch": 3.375092296332759, + "grad_norm": 0.9541462063789368, + "learning_rate": 2.031489382481588e-06, + "loss": 0.5997, + "step": 9143 + }, + { + "epoch": 3.375461481663795, + "grad_norm": 0.9127100110054016, + "learning_rate": 2.031011633630857e-06, + "loss": 0.5946, + "step": 9144 + }, + { + "epoch": 3.3758306669948315, + "grad_norm": 0.9254124164581299, + "learning_rate": 2.0305339025311865e-06, + "loss": 0.5861, + "step": 9145 + }, + { + "epoch": 3.3761998523258674, + "grad_norm": 0.9216455221176147, + "learning_rate": 2.0300561892006605e-06, + "loss": 0.5685, + "step": 9146 + }, + { + "epoch": 3.3765690376569037, + "grad_norm": 0.9237782955169678, + "learning_rate": 2.0295784936573584e-06, + "loss": 0.5801, + "step": 9147 + }, + { + "epoch": 3.37693822298794, + "grad_norm": 0.9143710136413574, + "learning_rate": 2.0291008159193626e-06, + "loss": 0.5902, + "step": 9148 + }, + { + "epoch": 3.377307408318976, + "grad_norm": 0.9133440256118774, + "learning_rate": 2.0286231560047524e-06, + "loss": 0.5632, + "step": 9149 + }, + { + "epoch": 3.3776765936500124, + "grad_norm": 0.9192723035812378, + "learning_rate": 2.0281455139316066e-06, + "loss": 0.5972, + "step": 9150 + }, + { + "epoch": 3.3780457789810483, + "grad_norm": 0.9374421834945679, + "learning_rate": 2.027667889718005e-06, + "loss": 0.6052, + "step": 9151 + }, + { + "epoch": 3.3784149643120847, + "grad_norm": 0.9640692472457886, + "learning_rate": 2.0271902833820236e-06, + "loss": 0.6441, + "step": 9152 + }, + { + "epoch": 3.378784149643121, + "grad_norm": 0.9231659770011902, + "learning_rate": 2.0267126949417402e-06, + "loss": 0.5441, + "step": 9153 + }, + { + "epoch": 3.379153334974157, + "grad_norm": 0.9600130319595337, + "learning_rate": 2.026235124415234e-06, + "loss": 0.5918, + "step": 9154 + }, + { + "epoch": 3.3795225203051933, + "grad_norm": 0.9261066913604736, + "learning_rate": 2.025757571820577e-06, + "loss": 0.6123, + "step": 9155 + }, + { + "epoch": 3.379891705636229, + "grad_norm": 0.913201093673706, + "learning_rate": 2.0252800371758464e-06, + "loss": 0.5873, + "step": 9156 + }, + { + "epoch": 3.3802608909672656, + "grad_norm": 0.9212847352027893, + "learning_rate": 2.024802520499118e-06, + "loss": 0.5687, + "step": 9157 + }, + { + "epoch": 3.380630076298302, + "grad_norm": 0.9482429027557373, + "learning_rate": 2.0243250218084627e-06, + "loss": 0.6332, + "step": 9158 + }, + { + "epoch": 3.380999261629338, + "grad_norm": 0.9556758999824524, + "learning_rate": 2.0238475411219566e-06, + "loss": 0.6095, + "step": 9159 + }, + { + "epoch": 3.381368446960374, + "grad_norm": 0.9434055685997009, + "learning_rate": 2.0233700784576698e-06, + "loss": 0.6071, + "step": 9160 + }, + { + "epoch": 3.38173763229141, + "grad_norm": 0.9221524596214294, + "learning_rate": 2.0228926338336752e-06, + "loss": 0.5496, + "step": 9161 + }, + { + "epoch": 3.3821068176224465, + "grad_norm": 0.9051884412765503, + "learning_rate": 2.022415207268044e-06, + "loss": 0.5783, + "step": 9162 + }, + { + "epoch": 3.382476002953483, + "grad_norm": 0.9457894563674927, + "learning_rate": 2.0219377987788457e-06, + "loss": 0.6099, + "step": 9163 + }, + { + "epoch": 3.3828451882845187, + "grad_norm": 0.9309843182563782, + "learning_rate": 2.0214604083841527e-06, + "loss": 0.5848, + "step": 9164 + }, + { + "epoch": 3.383214373615555, + "grad_norm": 0.8879731893539429, + "learning_rate": 2.0209830361020307e-06, + "loss": 0.5914, + "step": 9165 + }, + { + "epoch": 3.383583558946591, + "grad_norm": 0.909413754940033, + "learning_rate": 2.0205056819505494e-06, + "loss": 0.6128, + "step": 9166 + }, + { + "epoch": 3.3839527442776274, + "grad_norm": 0.9037505388259888, + "learning_rate": 2.0200283459477777e-06, + "loss": 0.5607, + "step": 9167 + }, + { + "epoch": 3.3843219296086637, + "grad_norm": 0.9324991703033447, + "learning_rate": 2.019551028111781e-06, + "loss": 0.5955, + "step": 9168 + }, + { + "epoch": 3.3846911149396997, + "grad_norm": 0.9363679885864258, + "learning_rate": 2.019073728460627e-06, + "loss": 0.5883, + "step": 9169 + }, + { + "epoch": 3.385060300270736, + "grad_norm": 0.9320642948150635, + "learning_rate": 2.01859644701238e-06, + "loss": 0.5632, + "step": 9170 + }, + { + "epoch": 3.385429485601772, + "grad_norm": 0.8993182182312012, + "learning_rate": 2.0181191837851055e-06, + "loss": 0.5736, + "step": 9171 + }, + { + "epoch": 3.3857986709328083, + "grad_norm": 0.9207606315612793, + "learning_rate": 2.017641938796868e-06, + "loss": 0.603, + "step": 9172 + }, + { + "epoch": 3.386167856263844, + "grad_norm": 0.931074857711792, + "learning_rate": 2.0171647120657304e-06, + "loss": 0.585, + "step": 9173 + }, + { + "epoch": 3.3865370415948806, + "grad_norm": 0.9188339710235596, + "learning_rate": 2.016687503609757e-06, + "loss": 0.6043, + "step": 9174 + }, + { + "epoch": 3.386906226925917, + "grad_norm": 0.8974730968475342, + "learning_rate": 2.016210313447008e-06, + "loss": 0.5861, + "step": 9175 + }, + { + "epoch": 3.387275412256953, + "grad_norm": 0.9416146278381348, + "learning_rate": 2.015733141595546e-06, + "loss": 0.6072, + "step": 9176 + }, + { + "epoch": 3.387644597587989, + "grad_norm": 0.9465506076812744, + "learning_rate": 2.015255988073433e-06, + "loss": 0.6039, + "step": 9177 + }, + { + "epoch": 3.3880137829190256, + "grad_norm": 0.9561184048652649, + "learning_rate": 2.014778852898727e-06, + "loss": 0.614, + "step": 9178 + }, + { + "epoch": 3.3883829682500615, + "grad_norm": 0.9234836101531982, + "learning_rate": 2.014301736089489e-06, + "loss": 0.5739, + "step": 9179 + }, + { + "epoch": 3.388752153581098, + "grad_norm": 0.9124407768249512, + "learning_rate": 2.013824637663776e-06, + "loss": 0.5938, + "step": 9180 + }, + { + "epoch": 3.3891213389121337, + "grad_norm": 0.9100334048271179, + "learning_rate": 2.013347557639647e-06, + "loss": 0.5955, + "step": 9181 + }, + { + "epoch": 3.38949052424317, + "grad_norm": 0.9076642990112305, + "learning_rate": 2.0128704960351603e-06, + "loss": 0.5869, + "step": 9182 + }, + { + "epoch": 3.389859709574206, + "grad_norm": 0.940141499042511, + "learning_rate": 2.01239345286837e-06, + "loss": 0.5977, + "step": 9183 + }, + { + "epoch": 3.3902288949052424, + "grad_norm": 0.9094479084014893, + "learning_rate": 2.011916428157336e-06, + "loss": 0.6276, + "step": 9184 + }, + { + "epoch": 3.3905980802362787, + "grad_norm": 0.9013211727142334, + "learning_rate": 2.0114394219201087e-06, + "loss": 0.6228, + "step": 9185 + }, + { + "epoch": 3.3909672655673146, + "grad_norm": 0.9273747801780701, + "learning_rate": 2.0109624341747465e-06, + "loss": 0.5964, + "step": 9186 + }, + { + "epoch": 3.391336450898351, + "grad_norm": 0.9030047655105591, + "learning_rate": 2.010485464939302e-06, + "loss": 0.5658, + "step": 9187 + }, + { + "epoch": 3.3917056362293874, + "grad_norm": 0.9187841415405273, + "learning_rate": 2.0100085142318274e-06, + "loss": 0.5925, + "step": 9188 + }, + { + "epoch": 3.3920748215604233, + "grad_norm": 0.9127846360206604, + "learning_rate": 2.0095315820703766e-06, + "loss": 0.5679, + "step": 9189 + }, + { + "epoch": 3.3924440068914596, + "grad_norm": 0.9222260117530823, + "learning_rate": 2.009054668473e-06, + "loss": 0.5955, + "step": 9190 + }, + { + "epoch": 3.3928131922224956, + "grad_norm": 0.8731462955474854, + "learning_rate": 2.008577773457749e-06, + "loss": 0.5516, + "step": 9191 + }, + { + "epoch": 3.393182377553532, + "grad_norm": 0.9500693082809448, + "learning_rate": 2.0081008970426753e-06, + "loss": 0.5954, + "step": 9192 + }, + { + "epoch": 3.393551562884568, + "grad_norm": 0.9457334280014038, + "learning_rate": 2.0076240392458256e-06, + "loss": 0.5972, + "step": 9193 + }, + { + "epoch": 3.393920748215604, + "grad_norm": 0.9222469329833984, + "learning_rate": 2.0071472000852517e-06, + "loss": 0.5803, + "step": 9194 + }, + { + "epoch": 3.3942899335466405, + "grad_norm": 0.8989402651786804, + "learning_rate": 2.0066703795790012e-06, + "loss": 0.5829, + "step": 9195 + }, + { + "epoch": 3.3946591188776765, + "grad_norm": 0.9355420470237732, + "learning_rate": 2.0061935777451207e-06, + "loss": 0.5896, + "step": 9196 + }, + { + "epoch": 3.395028304208713, + "grad_norm": 0.9048003554344177, + "learning_rate": 2.005716794601658e-06, + "loss": 0.5856, + "step": 9197 + }, + { + "epoch": 3.395397489539749, + "grad_norm": 0.9305781722068787, + "learning_rate": 2.0052400301666576e-06, + "loss": 0.5743, + "step": 9198 + }, + { + "epoch": 3.395766674870785, + "grad_norm": 0.9108872413635254, + "learning_rate": 2.004763284458166e-06, + "loss": 0.5829, + "step": 9199 + }, + { + "epoch": 3.3961358602018215, + "grad_norm": 0.9210634231567383, + "learning_rate": 2.0042865574942288e-06, + "loss": 0.6191, + "step": 9200 + }, + { + "epoch": 3.3965050455328574, + "grad_norm": 0.8966801762580872, + "learning_rate": 2.003809849292888e-06, + "loss": 0.567, + "step": 9201 + }, + { + "epoch": 3.3968742308638937, + "grad_norm": 0.9140282869338989, + "learning_rate": 2.003333159872189e-06, + "loss": 0.6026, + "step": 9202 + }, + { + "epoch": 3.3972434161949296, + "grad_norm": 0.920219361782074, + "learning_rate": 2.002856489250171e-06, + "loss": 0.5862, + "step": 9203 + }, + { + "epoch": 3.397612601525966, + "grad_norm": 1.0405218601226807, + "learning_rate": 2.0023798374448796e-06, + "loss": 0.6122, + "step": 9204 + }, + { + "epoch": 3.3979817868570024, + "grad_norm": 0.9411656856536865, + "learning_rate": 2.001903204474354e-06, + "loss": 0.6215, + "step": 9205 + }, + { + "epoch": 3.3983509721880383, + "grad_norm": 0.9052700400352478, + "learning_rate": 2.001426590356635e-06, + "loss": 0.5753, + "step": 9206 + }, + { + "epoch": 3.3987201575190746, + "grad_norm": 0.9159415364265442, + "learning_rate": 2.0009499951097623e-06, + "loss": 0.5784, + "step": 9207 + }, + { + "epoch": 3.399089342850111, + "grad_norm": 0.9144145846366882, + "learning_rate": 2.0004734187517744e-06, + "loss": 0.5817, + "step": 9208 + }, + { + "epoch": 3.399458528181147, + "grad_norm": 0.9210705757141113, + "learning_rate": 1.9999968613007097e-06, + "loss": 0.5866, + "step": 9209 + }, + { + "epoch": 3.3998277135121833, + "grad_norm": 0.9279311895370483, + "learning_rate": 1.999520322774607e-06, + "loss": 0.5985, + "step": 9210 + }, + { + "epoch": 3.400196898843219, + "grad_norm": 0.9307151436805725, + "learning_rate": 1.9990438031915015e-06, + "loss": 0.6062, + "step": 9211 + }, + { + "epoch": 3.4005660841742555, + "grad_norm": 0.9310771226882935, + "learning_rate": 1.99856730256943e-06, + "loss": 0.5795, + "step": 9212 + }, + { + "epoch": 3.4009352695052915, + "grad_norm": 0.9312301874160767, + "learning_rate": 1.998090820926427e-06, + "loss": 0.6192, + "step": 9213 + }, + { + "epoch": 3.401304454836328, + "grad_norm": 0.9165865182876587, + "learning_rate": 1.997614358280529e-06, + "loss": 0.5662, + "step": 9214 + }, + { + "epoch": 3.401673640167364, + "grad_norm": 0.9230973720550537, + "learning_rate": 1.997137914649769e-06, + "loss": 0.5837, + "step": 9215 + }, + { + "epoch": 3.4020428254984, + "grad_norm": 0.8961165547370911, + "learning_rate": 1.99666149005218e-06, + "loss": 0.5653, + "step": 9216 + }, + { + "epoch": 3.4024120108294365, + "grad_norm": 0.9317620396614075, + "learning_rate": 1.996185084505795e-06, + "loss": 0.5824, + "step": 9217 + }, + { + "epoch": 3.4027811961604724, + "grad_norm": 0.9099761247634888, + "learning_rate": 1.9957086980286456e-06, + "loss": 0.5592, + "step": 9218 + }, + { + "epoch": 3.4031503814915087, + "grad_norm": 0.9393738508224487, + "learning_rate": 1.995232330638762e-06, + "loss": 0.5697, + "step": 9219 + }, + { + "epoch": 3.403519566822545, + "grad_norm": 0.8805381059646606, + "learning_rate": 1.9947559823541764e-06, + "loss": 0.5467, + "step": 9220 + }, + { + "epoch": 3.403888752153581, + "grad_norm": 0.9490686655044556, + "learning_rate": 1.9942796531929167e-06, + "loss": 0.611, + "step": 9221 + }, + { + "epoch": 3.4042579374846174, + "grad_norm": 0.9150283932685852, + "learning_rate": 1.9938033431730135e-06, + "loss": 0.6116, + "step": 9222 + }, + { + "epoch": 3.4046271228156533, + "grad_norm": 0.9304237365722656, + "learning_rate": 1.9933270523124924e-06, + "loss": 0.5699, + "step": 9223 + }, + { + "epoch": 3.4049963081466896, + "grad_norm": 0.924985945224762, + "learning_rate": 1.992850780629383e-06, + "loss": 0.5526, + "step": 9224 + }, + { + "epoch": 3.405365493477726, + "grad_norm": 0.9181817770004272, + "learning_rate": 1.992374528141712e-06, + "loss": 0.5936, + "step": 9225 + }, + { + "epoch": 3.405734678808762, + "grad_norm": 0.9083108305931091, + "learning_rate": 1.9918982948675048e-06, + "loss": 0.5751, + "step": 9226 + }, + { + "epoch": 3.4061038641397983, + "grad_norm": 0.9422109127044678, + "learning_rate": 1.991422080824787e-06, + "loss": 0.5906, + "step": 9227 + }, + { + "epoch": 3.406473049470834, + "grad_norm": 0.9159759283065796, + "learning_rate": 1.990945886031583e-06, + "loss": 0.5749, + "step": 9228 + }, + { + "epoch": 3.4068422348018705, + "grad_norm": 0.928768515586853, + "learning_rate": 1.9904697105059157e-06, + "loss": 0.6053, + "step": 9229 + }, + { + "epoch": 3.407211420132907, + "grad_norm": 0.8994535803794861, + "learning_rate": 1.98999355426581e-06, + "loss": 0.5749, + "step": 9230 + }, + { + "epoch": 3.407580605463943, + "grad_norm": 0.9047354459762573, + "learning_rate": 1.9895174173292863e-06, + "loss": 0.5824, + "step": 9231 + }, + { + "epoch": 3.407949790794979, + "grad_norm": 0.9167147278785706, + "learning_rate": 1.989041299714367e-06, + "loss": 0.5985, + "step": 9232 + }, + { + "epoch": 3.408318976126015, + "grad_norm": 0.9674847722053528, + "learning_rate": 1.9885652014390745e-06, + "loss": 0.6058, + "step": 9233 + }, + { + "epoch": 3.4086881614570514, + "grad_norm": 0.9516264796257019, + "learning_rate": 1.9880891225214273e-06, + "loss": 0.6099, + "step": 9234 + }, + { + "epoch": 3.409057346788088, + "grad_norm": 0.903702974319458, + "learning_rate": 1.9876130629794454e-06, + "loss": 0.5687, + "step": 9235 + }, + { + "epoch": 3.4094265321191237, + "grad_norm": 0.9248884916305542, + "learning_rate": 1.987137022831147e-06, + "loss": 0.601, + "step": 9236 + }, + { + "epoch": 3.40979571745016, + "grad_norm": 0.903754711151123, + "learning_rate": 1.9866610020945506e-06, + "loss": 0.5995, + "step": 9237 + }, + { + "epoch": 3.410164902781196, + "grad_norm": 0.9512732625007629, + "learning_rate": 1.9861850007876737e-06, + "loss": 0.5787, + "step": 9238 + }, + { + "epoch": 3.4105340881122324, + "grad_norm": 0.9387043118476868, + "learning_rate": 1.9857090189285317e-06, + "loss": 0.6039, + "step": 9239 + }, + { + "epoch": 3.4109032734432687, + "grad_norm": 0.9178261756896973, + "learning_rate": 1.9852330565351416e-06, + "loss": 0.5892, + "step": 9240 + }, + { + "epoch": 3.4112724587743046, + "grad_norm": 0.9243996143341064, + "learning_rate": 1.9847571136255167e-06, + "loss": 0.5805, + "step": 9241 + }, + { + "epoch": 3.411641644105341, + "grad_norm": 0.932486891746521, + "learning_rate": 1.984281190217673e-06, + "loss": 0.5808, + "step": 9242 + }, + { + "epoch": 3.412010829436377, + "grad_norm": 0.954239010810852, + "learning_rate": 1.983805286329624e-06, + "loss": 0.6445, + "step": 9243 + }, + { + "epoch": 3.4123800147674133, + "grad_norm": 0.8958541750907898, + "learning_rate": 1.983329401979382e-06, + "loss": 0.5826, + "step": 9244 + }, + { + "epoch": 3.4127492000984496, + "grad_norm": 0.9305569529533386, + "learning_rate": 1.982853537184959e-06, + "loss": 0.5647, + "step": 9245 + }, + { + "epoch": 3.4131183854294855, + "grad_norm": 0.9339607357978821, + "learning_rate": 1.9823776919643656e-06, + "loss": 0.5829, + "step": 9246 + }, + { + "epoch": 3.413487570760522, + "grad_norm": 0.9719569683074951, + "learning_rate": 1.9819018663356137e-06, + "loss": 0.5862, + "step": 9247 + }, + { + "epoch": 3.413856756091558, + "grad_norm": 0.9123527407646179, + "learning_rate": 1.981426060316713e-06, + "loss": 0.5679, + "step": 9248 + }, + { + "epoch": 3.414225941422594, + "grad_norm": 0.9175556898117065, + "learning_rate": 1.980950273925672e-06, + "loss": 0.5918, + "step": 9249 + }, + { + "epoch": 3.41459512675363, + "grad_norm": 0.9488082528114319, + "learning_rate": 1.9804745071804996e-06, + "loss": 0.5945, + "step": 9250 + }, + { + "epoch": 3.4149643120846664, + "grad_norm": 0.9527239203453064, + "learning_rate": 1.9799987600992022e-06, + "loss": 0.5757, + "step": 9251 + }, + { + "epoch": 3.415333497415703, + "grad_norm": 0.961338996887207, + "learning_rate": 1.9795230326997877e-06, + "loss": 0.6191, + "step": 9252 + }, + { + "epoch": 3.4157026827467387, + "grad_norm": 0.9356257915496826, + "learning_rate": 1.979047325000263e-06, + "loss": 0.5993, + "step": 9253 + }, + { + "epoch": 3.416071868077775, + "grad_norm": 0.9302783608436584, + "learning_rate": 1.9785716370186325e-06, + "loss": 0.6054, + "step": 9254 + }, + { + "epoch": 3.4164410534088114, + "grad_norm": 0.9433194994926453, + "learning_rate": 1.978095968772901e-06, + "loss": 0.5492, + "step": 9255 + }, + { + "epoch": 3.4168102387398473, + "grad_norm": 0.9477927684783936, + "learning_rate": 1.977620320281072e-06, + "loss": 0.5988, + "step": 9256 + }, + { + "epoch": 3.4171794240708837, + "grad_norm": 0.9441022872924805, + "learning_rate": 1.977144691561149e-06, + "loss": 0.5913, + "step": 9257 + }, + { + "epoch": 3.4175486094019196, + "grad_norm": 0.8984287977218628, + "learning_rate": 1.9766690826311348e-06, + "loss": 0.5796, + "step": 9258 + }, + { + "epoch": 3.417917794732956, + "grad_norm": 0.9031214714050293, + "learning_rate": 1.9761934935090306e-06, + "loss": 0.5853, + "step": 9259 + }, + { + "epoch": 3.418286980063992, + "grad_norm": 0.9403338432312012, + "learning_rate": 1.9757179242128376e-06, + "loss": 0.5849, + "step": 9260 + }, + { + "epoch": 3.4186561653950283, + "grad_norm": 0.9294829368591309, + "learning_rate": 1.975242374760554e-06, + "loss": 0.6147, + "step": 9261 + }, + { + "epoch": 3.4190253507260646, + "grad_norm": 0.9589325785636902, + "learning_rate": 1.9747668451701825e-06, + "loss": 0.5948, + "step": 9262 + }, + { + "epoch": 3.4193945360571005, + "grad_norm": 0.9766724109649658, + "learning_rate": 1.9742913354597203e-06, + "loss": 0.6047, + "step": 9263 + }, + { + "epoch": 3.419763721388137, + "grad_norm": 0.9174277186393738, + "learning_rate": 1.9738158456471643e-06, + "loss": 0.6206, + "step": 9264 + }, + { + "epoch": 3.4201329067191732, + "grad_norm": 0.9277015924453735, + "learning_rate": 1.9733403757505134e-06, + "loss": 0.5878, + "step": 9265 + }, + { + "epoch": 3.420502092050209, + "grad_norm": 0.9695833325386047, + "learning_rate": 1.9728649257877617e-06, + "loss": 0.6147, + "step": 9266 + }, + { + "epoch": 3.4208712773812455, + "grad_norm": 0.9049524664878845, + "learning_rate": 1.972389495776907e-06, + "loss": 0.592, + "step": 9267 + }, + { + "epoch": 3.4212404627122814, + "grad_norm": 0.9180031418800354, + "learning_rate": 1.971914085735944e-06, + "loss": 0.5612, + "step": 9268 + }, + { + "epoch": 3.421609648043318, + "grad_norm": 0.926209032535553, + "learning_rate": 1.971438695682865e-06, + "loss": 0.5515, + "step": 9269 + }, + { + "epoch": 3.4219788333743537, + "grad_norm": 0.9417392611503601, + "learning_rate": 1.970963325635665e-06, + "loss": 0.5896, + "step": 9270 + }, + { + "epoch": 3.42234801870539, + "grad_norm": 0.9731624722480774, + "learning_rate": 1.9704879756123363e-06, + "loss": 0.6212, + "step": 9271 + }, + { + "epoch": 3.4227172040364264, + "grad_norm": 0.8980256915092468, + "learning_rate": 1.9700126456308704e-06, + "loss": 0.6035, + "step": 9272 + }, + { + "epoch": 3.4230863893674623, + "grad_norm": 0.9253155589103699, + "learning_rate": 1.969537335709259e-06, + "loss": 0.6054, + "step": 9273 + }, + { + "epoch": 3.4234555746984987, + "grad_norm": 0.9410659670829773, + "learning_rate": 1.9690620458654917e-06, + "loss": 0.6085, + "step": 9274 + }, + { + "epoch": 3.423824760029535, + "grad_norm": 0.915116012096405, + "learning_rate": 1.9685867761175584e-06, + "loss": 0.5651, + "step": 9275 + }, + { + "epoch": 3.424193945360571, + "grad_norm": 0.9523504376411438, + "learning_rate": 1.9681115264834482e-06, + "loss": 0.6107, + "step": 9276 + }, + { + "epoch": 3.4245631306916073, + "grad_norm": 0.92009037733078, + "learning_rate": 1.9676362969811487e-06, + "loss": 0.5893, + "step": 9277 + }, + { + "epoch": 3.4249323160226433, + "grad_norm": 0.9476304650306702, + "learning_rate": 1.9671610876286475e-06, + "loss": 0.5951, + "step": 9278 + }, + { + "epoch": 3.4253015013536796, + "grad_norm": 0.9166354537010193, + "learning_rate": 1.966685898443931e-06, + "loss": 0.5809, + "step": 9279 + }, + { + "epoch": 3.4256706866847155, + "grad_norm": 0.9498445987701416, + "learning_rate": 1.9662107294449834e-06, + "loss": 0.604, + "step": 9280 + }, + { + "epoch": 3.426039872015752, + "grad_norm": 0.9234588146209717, + "learning_rate": 1.9657355806497933e-06, + "loss": 0.5961, + "step": 9281 + }, + { + "epoch": 3.4264090573467882, + "grad_norm": 0.9325228333473206, + "learning_rate": 1.965260452076342e-06, + "loss": 0.6054, + "step": 9282 + }, + { + "epoch": 3.426778242677824, + "grad_norm": 0.9039566516876221, + "learning_rate": 1.964785343742614e-06, + "loss": 0.6075, + "step": 9283 + }, + { + "epoch": 3.4271474280088605, + "grad_norm": 0.932978093624115, + "learning_rate": 1.9643102556665917e-06, + "loss": 0.5878, + "step": 9284 + }, + { + "epoch": 3.4275166133398964, + "grad_norm": 0.9668893814086914, + "learning_rate": 1.963835187866257e-06, + "loss": 0.608, + "step": 9285 + }, + { + "epoch": 3.427885798670933, + "grad_norm": 0.9297093749046326, + "learning_rate": 1.9633601403595917e-06, + "loss": 0.5917, + "step": 9286 + }, + { + "epoch": 3.428254984001969, + "grad_norm": 0.8989117741584778, + "learning_rate": 1.962885113164575e-06, + "loss": 0.5363, + "step": 9287 + }, + { + "epoch": 3.428624169333005, + "grad_norm": 0.9306706190109253, + "learning_rate": 1.962410106299188e-06, + "loss": 0.5726, + "step": 9288 + }, + { + "epoch": 3.4289933546640414, + "grad_norm": 0.9187400341033936, + "learning_rate": 1.9619351197814085e-06, + "loss": 0.5902, + "step": 9289 + }, + { + "epoch": 3.4293625399950773, + "grad_norm": 0.910029947757721, + "learning_rate": 1.961460153629214e-06, + "loss": 0.6115, + "step": 9290 + }, + { + "epoch": 3.4297317253261137, + "grad_norm": 0.9777998328208923, + "learning_rate": 1.960985207860584e-06, + "loss": 0.6151, + "step": 9291 + }, + { + "epoch": 3.43010091065715, + "grad_norm": 0.9136645197868347, + "learning_rate": 1.9605102824934936e-06, + "loss": 0.5853, + "step": 9292 + }, + { + "epoch": 3.430470095988186, + "grad_norm": 0.9364112615585327, + "learning_rate": 1.9600353775459184e-06, + "loss": 0.6189, + "step": 9293 + }, + { + "epoch": 3.4308392813192223, + "grad_norm": 0.9367743730545044, + "learning_rate": 1.9595604930358337e-06, + "loss": 0.5541, + "step": 9294 + }, + { + "epoch": 3.4312084666502582, + "grad_norm": 0.9225846529006958, + "learning_rate": 1.959085628981214e-06, + "loss": 0.5586, + "step": 9295 + }, + { + "epoch": 3.4315776519812946, + "grad_norm": 0.916131854057312, + "learning_rate": 1.9586107854000327e-06, + "loss": 0.5778, + "step": 9296 + }, + { + "epoch": 3.431946837312331, + "grad_norm": 0.9251326322555542, + "learning_rate": 1.958135962310262e-06, + "loss": 0.5868, + "step": 9297 + }, + { + "epoch": 3.432316022643367, + "grad_norm": 0.9161785244941711, + "learning_rate": 1.9576611597298745e-06, + "loss": 0.5712, + "step": 9298 + }, + { + "epoch": 3.4326852079744032, + "grad_norm": 0.9277284741401672, + "learning_rate": 1.9571863776768404e-06, + "loss": 0.5894, + "step": 9299 + }, + { + "epoch": 3.433054393305439, + "grad_norm": 0.9279661774635315, + "learning_rate": 1.95671161616913e-06, + "loss": 0.6618, + "step": 9300 + }, + { + "epoch": 3.4334235786364755, + "grad_norm": 0.9188452363014221, + "learning_rate": 1.956236875224715e-06, + "loss": 0.6041, + "step": 9301 + }, + { + "epoch": 3.433792763967512, + "grad_norm": 0.9370484352111816, + "learning_rate": 1.9557621548615614e-06, + "loss": 0.6018, + "step": 9302 + }, + { + "epoch": 3.434161949298548, + "grad_norm": 0.9057812690734863, + "learning_rate": 1.9552874550976392e-06, + "loss": 0.5939, + "step": 9303 + }, + { + "epoch": 3.434531134629584, + "grad_norm": 0.9344809055328369, + "learning_rate": 1.954812775950915e-06, + "loss": 0.6129, + "step": 9304 + }, + { + "epoch": 3.43490031996062, + "grad_norm": 0.9341896772384644, + "learning_rate": 1.9543381174393545e-06, + "loss": 0.5788, + "step": 9305 + }, + { + "epoch": 3.4352695052916564, + "grad_norm": 0.9270790815353394, + "learning_rate": 1.9538634795809247e-06, + "loss": 0.5496, + "step": 9306 + }, + { + "epoch": 3.435638690622693, + "grad_norm": 0.9269886016845703, + "learning_rate": 1.9533888623935894e-06, + "loss": 0.5917, + "step": 9307 + }, + { + "epoch": 3.4360078759537287, + "grad_norm": 0.9198427796363831, + "learning_rate": 1.9529142658953125e-06, + "loss": 0.622, + "step": 9308 + }, + { + "epoch": 3.436377061284765, + "grad_norm": 0.9565939903259277, + "learning_rate": 1.95243969010406e-06, + "loss": 0.5953, + "step": 9309 + }, + { + "epoch": 3.436746246615801, + "grad_norm": 0.947950005531311, + "learning_rate": 1.9519651350377903e-06, + "loss": 0.5965, + "step": 9310 + }, + { + "epoch": 3.4371154319468373, + "grad_norm": 0.9178338646888733, + "learning_rate": 1.9514906007144686e-06, + "loss": 0.5378, + "step": 9311 + }, + { + "epoch": 3.4374846172778737, + "grad_norm": 0.9171031713485718, + "learning_rate": 1.951016087152054e-06, + "loss": 0.5727, + "step": 9312 + }, + { + "epoch": 3.4378538026089096, + "grad_norm": 0.9688862562179565, + "learning_rate": 1.9505415943685073e-06, + "loss": 0.6443, + "step": 9313 + }, + { + "epoch": 3.438222987939946, + "grad_norm": 0.9189727306365967, + "learning_rate": 1.950067122381788e-06, + "loss": 0.5769, + "step": 9314 + }, + { + "epoch": 3.438592173270982, + "grad_norm": 0.929885983467102, + "learning_rate": 1.9495926712098547e-06, + "loss": 0.6393, + "step": 9315 + }, + { + "epoch": 3.4389613586020182, + "grad_norm": 0.8990638852119446, + "learning_rate": 1.949118240870665e-06, + "loss": 0.5802, + "step": 9316 + }, + { + "epoch": 3.439330543933054, + "grad_norm": 0.9096630215644836, + "learning_rate": 1.9486438313821755e-06, + "loss": 0.5572, + "step": 9317 + }, + { + "epoch": 3.4396997292640905, + "grad_norm": 0.9065389037132263, + "learning_rate": 1.9481694427623425e-06, + "loss": 0.5486, + "step": 9318 + }, + { + "epoch": 3.440068914595127, + "grad_norm": 0.9339439272880554, + "learning_rate": 1.9476950750291236e-06, + "loss": 0.6178, + "step": 9319 + }, + { + "epoch": 3.440438099926163, + "grad_norm": 0.9283116459846497, + "learning_rate": 1.9472207282004702e-06, + "loss": 0.6043, + "step": 9320 + }, + { + "epoch": 3.440807285257199, + "grad_norm": 0.9257859587669373, + "learning_rate": 1.946746402294339e-06, + "loss": 0.6115, + "step": 9321 + }, + { + "epoch": 3.4411764705882355, + "grad_norm": 0.9471895694732666, + "learning_rate": 1.946272097328681e-06, + "loss": 0.6067, + "step": 9322 + }, + { + "epoch": 3.4415456559192714, + "grad_norm": 0.9470584392547607, + "learning_rate": 1.94579781332145e-06, + "loss": 0.5893, + "step": 9323 + }, + { + "epoch": 3.4419148412503078, + "grad_norm": 0.929463267326355, + "learning_rate": 1.945323550290597e-06, + "loss": 0.6229, + "step": 9324 + }, + { + "epoch": 3.4422840265813437, + "grad_norm": 0.9108128547668457, + "learning_rate": 1.944849308254072e-06, + "loss": 0.593, + "step": 9325 + }, + { + "epoch": 3.44265321191238, + "grad_norm": 0.9169228076934814, + "learning_rate": 1.944375087229826e-06, + "loss": 0.5539, + "step": 9326 + }, + { + "epoch": 3.443022397243416, + "grad_norm": 0.9178633093833923, + "learning_rate": 1.943900887235807e-06, + "loss": 0.5697, + "step": 9327 + }, + { + "epoch": 3.4433915825744523, + "grad_norm": 0.9168693423271179, + "learning_rate": 1.9434267082899637e-06, + "loss": 0.6015, + "step": 9328 + }, + { + "epoch": 3.4437607679054887, + "grad_norm": 0.9193868041038513, + "learning_rate": 1.942952550410245e-06, + "loss": 0.5569, + "step": 9329 + }, + { + "epoch": 3.4441299532365246, + "grad_norm": 0.9076756238937378, + "learning_rate": 1.9424784136145952e-06, + "loss": 0.5629, + "step": 9330 + }, + { + "epoch": 3.444499138567561, + "grad_norm": 0.9478167295455933, + "learning_rate": 1.942004297920962e-06, + "loss": 0.6371, + "step": 9331 + }, + { + "epoch": 3.4448683238985973, + "grad_norm": 0.9282375574111938, + "learning_rate": 1.9415302033472902e-06, + "loss": 0.5742, + "step": 9332 + }, + { + "epoch": 3.4452375092296332, + "grad_norm": 0.9030489325523376, + "learning_rate": 1.9410561299115234e-06, + "loss": 0.5824, + "step": 9333 + }, + { + "epoch": 3.4456066945606696, + "grad_norm": 0.937932550907135, + "learning_rate": 1.940582077631607e-06, + "loss": 0.6234, + "step": 9334 + }, + { + "epoch": 3.4459758798917055, + "grad_norm": 0.9232563972473145, + "learning_rate": 1.940108046525481e-06, + "loss": 0.5882, + "step": 9335 + }, + { + "epoch": 3.446345065222742, + "grad_norm": 0.944292426109314, + "learning_rate": 1.939634036611089e-06, + "loss": 0.5937, + "step": 9336 + }, + { + "epoch": 3.446714250553778, + "grad_norm": 0.9111289381980896, + "learning_rate": 1.9391600479063723e-06, + "loss": 0.5829, + "step": 9337 + }, + { + "epoch": 3.447083435884814, + "grad_norm": 0.9065266251564026, + "learning_rate": 1.93868608042927e-06, + "loss": 0.5909, + "step": 9338 + }, + { + "epoch": 3.4474526212158505, + "grad_norm": 0.903954803943634, + "learning_rate": 1.938212134197724e-06, + "loss": 0.5846, + "step": 9339 + }, + { + "epoch": 3.4478218065468864, + "grad_norm": 0.9028177261352539, + "learning_rate": 1.9377382092296697e-06, + "loss": 0.578, + "step": 9340 + }, + { + "epoch": 3.4481909918779228, + "grad_norm": 0.9523911476135254, + "learning_rate": 1.937264305543047e-06, + "loss": 0.6393, + "step": 9341 + }, + { + "epoch": 3.448560177208959, + "grad_norm": 0.9292596578598022, + "learning_rate": 1.9367904231557934e-06, + "loss": 0.5921, + "step": 9342 + }, + { + "epoch": 3.448929362539995, + "grad_norm": 0.9520934820175171, + "learning_rate": 1.9363165620858444e-06, + "loss": 0.6032, + "step": 9343 + }, + { + "epoch": 3.4492985478710314, + "grad_norm": 0.9022828340530396, + "learning_rate": 1.9358427223511357e-06, + "loss": 0.5908, + "step": 9344 + }, + { + "epoch": 3.4496677332020673, + "grad_norm": 0.9262678027153015, + "learning_rate": 1.9353689039696017e-06, + "loss": 0.5883, + "step": 9345 + }, + { + "epoch": 3.4500369185331037, + "grad_norm": 0.9371560215950012, + "learning_rate": 1.9348951069591756e-06, + "loss": 0.6243, + "step": 9346 + }, + { + "epoch": 3.4504061038641396, + "grad_norm": 0.9151279926300049, + "learning_rate": 1.9344213313377934e-06, + "loss": 0.6015, + "step": 9347 + }, + { + "epoch": 3.450775289195176, + "grad_norm": 0.8878635764122009, + "learning_rate": 1.9339475771233834e-06, + "loss": 0.545, + "step": 9348 + }, + { + "epoch": 3.4511444745262123, + "grad_norm": 0.9339662194252014, + "learning_rate": 1.9334738443338807e-06, + "loss": 0.6204, + "step": 9349 + }, + { + "epoch": 3.4515136598572482, + "grad_norm": 0.9327067732810974, + "learning_rate": 1.9330001329872123e-06, + "loss": 0.6164, + "step": 9350 + }, + { + "epoch": 3.4518828451882846, + "grad_norm": 0.9263085722923279, + "learning_rate": 1.9325264431013107e-06, + "loss": 0.5997, + "step": 9351 + }, + { + "epoch": 3.452252030519321, + "grad_norm": 0.9228039979934692, + "learning_rate": 1.9320527746941046e-06, + "loss": 0.5871, + "step": 9352 + }, + { + "epoch": 3.452621215850357, + "grad_norm": 0.9101688861846924, + "learning_rate": 1.931579127783521e-06, + "loss": 0.585, + "step": 9353 + }, + { + "epoch": 3.452990401181393, + "grad_norm": 0.8834836483001709, + "learning_rate": 1.931105502387489e-06, + "loss": 0.5548, + "step": 9354 + }, + { + "epoch": 3.453359586512429, + "grad_norm": 0.9102174043655396, + "learning_rate": 1.930631898523933e-06, + "loss": 0.6212, + "step": 9355 + }, + { + "epoch": 3.4537287718434655, + "grad_norm": 0.9224945306777954, + "learning_rate": 1.93015831621078e-06, + "loss": 0.5901, + "step": 9356 + }, + { + "epoch": 3.4540979571745014, + "grad_norm": 0.9285571575164795, + "learning_rate": 1.9296847554659564e-06, + "loss": 0.5639, + "step": 9357 + }, + { + "epoch": 3.4544671425055378, + "grad_norm": 0.9173696041107178, + "learning_rate": 1.9292112163073833e-06, + "loss": 0.58, + "step": 9358 + }, + { + "epoch": 3.454836327836574, + "grad_norm": 0.9285219311714172, + "learning_rate": 1.928737698752987e-06, + "loss": 0.5811, + "step": 9359 + }, + { + "epoch": 3.45520551316761, + "grad_norm": 0.9212900400161743, + "learning_rate": 1.9282642028206866e-06, + "loss": 0.5975, + "step": 9360 + }, + { + "epoch": 3.4555746984986464, + "grad_norm": 0.92405104637146, + "learning_rate": 1.927790728528407e-06, + "loss": 0.571, + "step": 9361 + }, + { + "epoch": 3.4559438838296823, + "grad_norm": 0.9488052129745483, + "learning_rate": 1.9273172758940672e-06, + "loss": 0.613, + "step": 9362 + }, + { + "epoch": 3.4563130691607187, + "grad_norm": 0.9641391634941101, + "learning_rate": 1.9268438449355883e-06, + "loss": 0.5797, + "step": 9363 + }, + { + "epoch": 3.456682254491755, + "grad_norm": 0.9415781497955322, + "learning_rate": 1.9263704356708885e-06, + "loss": 0.6363, + "step": 9364 + }, + { + "epoch": 3.457051439822791, + "grad_norm": 0.9348901510238647, + "learning_rate": 1.9258970481178866e-06, + "loss": 0.5771, + "step": 9365 + }, + { + "epoch": 3.4574206251538273, + "grad_norm": 0.9179986715316772, + "learning_rate": 1.9254236822944998e-06, + "loss": 0.5807, + "step": 9366 + }, + { + "epoch": 3.4577898104848632, + "grad_norm": 0.9296615123748779, + "learning_rate": 1.924950338218647e-06, + "loss": 0.5903, + "step": 9367 + }, + { + "epoch": 3.4581589958158996, + "grad_norm": 0.9373036623001099, + "learning_rate": 1.9244770159082405e-06, + "loss": 0.5957, + "step": 9368 + }, + { + "epoch": 3.458528181146936, + "grad_norm": 0.9135959148406982, + "learning_rate": 1.9240037153811993e-06, + "loss": 0.5781, + "step": 9369 + }, + { + "epoch": 3.458897366477972, + "grad_norm": 0.9020964503288269, + "learning_rate": 1.9235304366554334e-06, + "loss": 0.5875, + "step": 9370 + }, + { + "epoch": 3.459266551809008, + "grad_norm": 0.8970407247543335, + "learning_rate": 1.9230571797488593e-06, + "loss": 0.5652, + "step": 9371 + }, + { + "epoch": 3.459635737140044, + "grad_norm": 0.9273808598518372, + "learning_rate": 1.9225839446793897e-06, + "loss": 0.5847, + "step": 9372 + }, + { + "epoch": 3.4600049224710805, + "grad_norm": 0.9242944717407227, + "learning_rate": 1.922110731464935e-06, + "loss": 0.5578, + "step": 9373 + }, + { + "epoch": 3.460374107802117, + "grad_norm": 0.9473053216934204, + "learning_rate": 1.9216375401234074e-06, + "loss": 0.5893, + "step": 9374 + }, + { + "epoch": 3.4607432931331528, + "grad_norm": 0.9162693023681641, + "learning_rate": 1.9211643706727157e-06, + "loss": 0.5767, + "step": 9375 + }, + { + "epoch": 3.461112478464189, + "grad_norm": 0.9143821597099304, + "learning_rate": 1.920691223130769e-06, + "loss": 0.5563, + "step": 9376 + }, + { + "epoch": 3.461481663795225, + "grad_norm": 0.941652238368988, + "learning_rate": 1.9202180975154786e-06, + "loss": 0.608, + "step": 9377 + }, + { + "epoch": 3.4618508491262614, + "grad_norm": 0.9431430697441101, + "learning_rate": 1.9197449938447486e-06, + "loss": 0.602, + "step": 9378 + }, + { + "epoch": 3.4622200344572978, + "grad_norm": 0.9358123540878296, + "learning_rate": 1.9192719121364877e-06, + "loss": 0.5973, + "step": 9379 + }, + { + "epoch": 3.4625892197883337, + "grad_norm": 0.9337643384933472, + "learning_rate": 1.9187988524086026e-06, + "loss": 0.6037, + "step": 9380 + }, + { + "epoch": 3.46295840511937, + "grad_norm": 0.9177729487419128, + "learning_rate": 1.9183258146789966e-06, + "loss": 0.6161, + "step": 9381 + }, + { + "epoch": 3.463327590450406, + "grad_norm": 0.9111480116844177, + "learning_rate": 1.917852798965576e-06, + "loss": 0.5577, + "step": 9382 + }, + { + "epoch": 3.4636967757814423, + "grad_norm": 0.9231239557266235, + "learning_rate": 1.9173798052862425e-06, + "loss": 0.5933, + "step": 9383 + }, + { + "epoch": 3.464065961112478, + "grad_norm": 0.9334813356399536, + "learning_rate": 1.916906833658899e-06, + "loss": 0.619, + "step": 9384 + }, + { + "epoch": 3.4644351464435146, + "grad_norm": 0.9569714069366455, + "learning_rate": 1.9164338841014492e-06, + "loss": 0.5723, + "step": 9385 + }, + { + "epoch": 3.464804331774551, + "grad_norm": 0.9193524718284607, + "learning_rate": 1.9159609566317906e-06, + "loss": 0.5985, + "step": 9386 + }, + { + "epoch": 3.465173517105587, + "grad_norm": 0.9111629724502563, + "learning_rate": 1.9154880512678277e-06, + "loss": 0.5816, + "step": 9387 + }, + { + "epoch": 3.465542702436623, + "grad_norm": 0.9481608271598816, + "learning_rate": 1.915015168027456e-06, + "loss": 0.592, + "step": 9388 + }, + { + "epoch": 3.4659118877676596, + "grad_norm": 0.9090874791145325, + "learning_rate": 1.914542306928576e-06, + "loss": 0.564, + "step": 9389 + }, + { + "epoch": 3.4662810730986955, + "grad_norm": 0.9111278653144836, + "learning_rate": 1.9140694679890854e-06, + "loss": 0.6008, + "step": 9390 + }, + { + "epoch": 3.466650258429732, + "grad_norm": 0.9339438080787659, + "learning_rate": 1.91359665122688e-06, + "loss": 0.6156, + "step": 9391 + }, + { + "epoch": 3.4670194437607678, + "grad_norm": 0.9187940359115601, + "learning_rate": 1.913123856659857e-06, + "loss": 0.5594, + "step": 9392 + }, + { + "epoch": 3.467388629091804, + "grad_norm": 0.9331493973731995, + "learning_rate": 1.91265108430591e-06, + "loss": 0.5644, + "step": 9393 + }, + { + "epoch": 3.46775781442284, + "grad_norm": 0.9474959969520569, + "learning_rate": 1.9121783341829338e-06, + "loss": 0.6137, + "step": 9394 + }, + { + "epoch": 3.4681269997538764, + "grad_norm": 0.9405008554458618, + "learning_rate": 1.9117056063088236e-06, + "loss": 0.5721, + "step": 9395 + }, + { + "epoch": 3.4684961850849128, + "grad_norm": 0.9162678122520447, + "learning_rate": 1.9112329007014686e-06, + "loss": 0.5784, + "step": 9396 + }, + { + "epoch": 3.4688653704159487, + "grad_norm": 0.9364110231399536, + "learning_rate": 1.9107602173787647e-06, + "loss": 0.6124, + "step": 9397 + }, + { + "epoch": 3.469234555746985, + "grad_norm": 0.9204502105712891, + "learning_rate": 1.910287556358599e-06, + "loss": 0.5813, + "step": 9398 + }, + { + "epoch": 3.4696037410780214, + "grad_norm": 0.8954125046730042, + "learning_rate": 1.909814917658863e-06, + "loss": 0.5658, + "step": 9399 + }, + { + "epoch": 3.4699729264090573, + "grad_norm": 0.9047531485557556, + "learning_rate": 1.909342301297447e-06, + "loss": 0.5937, + "step": 9400 + }, + { + "epoch": 3.4703421117400937, + "grad_norm": 0.9498373866081238, + "learning_rate": 1.908869707292238e-06, + "loss": 0.5722, + "step": 9401 + }, + { + "epoch": 3.4707112970711296, + "grad_norm": 0.9350908398628235, + "learning_rate": 1.9083971356611246e-06, + "loss": 0.5976, + "step": 9402 + }, + { + "epoch": 3.471080482402166, + "grad_norm": 0.9277746677398682, + "learning_rate": 1.9079245864219927e-06, + "loss": 0.5877, + "step": 9403 + }, + { + "epoch": 3.471449667733202, + "grad_norm": 0.9602253437042236, + "learning_rate": 1.907452059592727e-06, + "loss": 0.6048, + "step": 9404 + }, + { + "epoch": 3.471818853064238, + "grad_norm": 0.9562898278236389, + "learning_rate": 1.9069795551912162e-06, + "loss": 0.5898, + "step": 9405 + }, + { + "epoch": 3.4721880383952746, + "grad_norm": 0.9324066638946533, + "learning_rate": 1.9065070732353403e-06, + "loss": 0.5781, + "step": 9406 + }, + { + "epoch": 3.4725572237263105, + "grad_norm": 0.922398030757904, + "learning_rate": 1.9060346137429859e-06, + "loss": 0.5987, + "step": 9407 + }, + { + "epoch": 3.472926409057347, + "grad_norm": 0.9100939631462097, + "learning_rate": 1.905562176732032e-06, + "loss": 0.5932, + "step": 9408 + }, + { + "epoch": 3.473295594388383, + "grad_norm": 0.9759135842323303, + "learning_rate": 1.905089762220363e-06, + "loss": 0.6203, + "step": 9409 + }, + { + "epoch": 3.473664779719419, + "grad_norm": 0.9203759431838989, + "learning_rate": 1.904617370225859e-06, + "loss": 0.603, + "step": 9410 + }, + { + "epoch": 3.4740339650504555, + "grad_norm": 0.9412469863891602, + "learning_rate": 1.904145000766399e-06, + "loss": 0.6192, + "step": 9411 + }, + { + "epoch": 3.4744031503814914, + "grad_norm": 0.9298244118690491, + "learning_rate": 1.9036726538598637e-06, + "loss": 0.5933, + "step": 9412 + }, + { + "epoch": 3.4747723357125277, + "grad_norm": 0.9507538676261902, + "learning_rate": 1.9032003295241295e-06, + "loss": 0.5943, + "step": 9413 + }, + { + "epoch": 3.4751415210435637, + "grad_norm": 0.9236178398132324, + "learning_rate": 1.9027280277770737e-06, + "loss": 0.5825, + "step": 9414 + }, + { + "epoch": 3.4755107063746, + "grad_norm": 0.9354607462882996, + "learning_rate": 1.9022557486365755e-06, + "loss": 0.6192, + "step": 9415 + }, + { + "epoch": 3.4758798917056364, + "grad_norm": 0.916000247001648, + "learning_rate": 1.901783492120507e-06, + "loss": 0.607, + "step": 9416 + }, + { + "epoch": 3.4762490770366723, + "grad_norm": 0.9364614486694336, + "learning_rate": 1.9013112582467446e-06, + "loss": 0.5909, + "step": 9417 + }, + { + "epoch": 3.4766182623677087, + "grad_norm": 0.8972388505935669, + "learning_rate": 1.900839047033163e-06, + "loss": 0.5822, + "step": 9418 + }, + { + "epoch": 3.476987447698745, + "grad_norm": 0.9046450853347778, + "learning_rate": 1.9003668584976337e-06, + "loss": 0.6037, + "step": 9419 + }, + { + "epoch": 3.477356633029781, + "grad_norm": 0.948391854763031, + "learning_rate": 1.8998946926580303e-06, + "loss": 0.6246, + "step": 9420 + }, + { + "epoch": 3.4777258183608173, + "grad_norm": 0.917730987071991, + "learning_rate": 1.8994225495322227e-06, + "loss": 0.5879, + "step": 9421 + }, + { + "epoch": 3.478095003691853, + "grad_norm": 0.9617593288421631, + "learning_rate": 1.898950429138082e-06, + "loss": 0.6001, + "step": 9422 + }, + { + "epoch": 3.4784641890228896, + "grad_norm": 0.9514678716659546, + "learning_rate": 1.8984783314934784e-06, + "loss": 0.582, + "step": 9423 + }, + { + "epoch": 3.4788333743539255, + "grad_norm": 0.9012026190757751, + "learning_rate": 1.8980062566162793e-06, + "loss": 0.5539, + "step": 9424 + }, + { + "epoch": 3.479202559684962, + "grad_norm": 0.9389762878417969, + "learning_rate": 1.8975342045243547e-06, + "loss": 0.5975, + "step": 9425 + }, + { + "epoch": 3.479571745015998, + "grad_norm": 0.956527590751648, + "learning_rate": 1.8970621752355689e-06, + "loss": 0.578, + "step": 9426 + }, + { + "epoch": 3.479940930347034, + "grad_norm": 0.9274093508720398, + "learning_rate": 1.89659016876779e-06, + "loss": 0.5957, + "step": 9427 + }, + { + "epoch": 3.4803101156780705, + "grad_norm": 0.8968981504440308, + "learning_rate": 1.8961181851388833e-06, + "loss": 0.5646, + "step": 9428 + }, + { + "epoch": 3.4806793010091064, + "grad_norm": 0.9521125555038452, + "learning_rate": 1.8956462243667118e-06, + "loss": 0.5748, + "step": 9429 + }, + { + "epoch": 3.4810484863401427, + "grad_norm": 0.9478389620780945, + "learning_rate": 1.8951742864691408e-06, + "loss": 0.623, + "step": 9430 + }, + { + "epoch": 3.481417671671179, + "grad_norm": 0.9159737825393677, + "learning_rate": 1.8947023714640316e-06, + "loss": 0.6004, + "step": 9431 + }, + { + "epoch": 3.481786857002215, + "grad_norm": 0.9462550282478333, + "learning_rate": 1.8942304793692468e-06, + "loss": 0.5838, + "step": 9432 + }, + { + "epoch": 3.4821560423332514, + "grad_norm": 0.9346432685852051, + "learning_rate": 1.8937586102026475e-06, + "loss": 0.5951, + "step": 9433 + }, + { + "epoch": 3.4825252276642873, + "grad_norm": 0.9263037443161011, + "learning_rate": 1.8932867639820927e-06, + "loss": 0.6021, + "step": 9434 + }, + { + "epoch": 3.4828944129953237, + "grad_norm": 0.9909936785697937, + "learning_rate": 1.8928149407254438e-06, + "loss": 0.5972, + "step": 9435 + }, + { + "epoch": 3.48326359832636, + "grad_norm": 0.9057047367095947, + "learning_rate": 1.892343140450556e-06, + "loss": 0.6054, + "step": 9436 + }, + { + "epoch": 3.483632783657396, + "grad_norm": 0.9152354001998901, + "learning_rate": 1.891871363175289e-06, + "loss": 0.5828, + "step": 9437 + }, + { + "epoch": 3.4840019689884323, + "grad_norm": 0.8986968398094177, + "learning_rate": 1.8913996089174998e-06, + "loss": 0.6109, + "step": 9438 + }, + { + "epoch": 3.484371154319468, + "grad_norm": 0.8934596180915833, + "learning_rate": 1.890927877695043e-06, + "loss": 0.596, + "step": 9439 + }, + { + "epoch": 3.4847403396505046, + "grad_norm": 0.9073708057403564, + "learning_rate": 1.890456169525774e-06, + "loss": 0.5943, + "step": 9440 + }, + { + "epoch": 3.485109524981541, + "grad_norm": 0.9223779439926147, + "learning_rate": 1.8899844844275462e-06, + "loss": 0.5614, + "step": 9441 + }, + { + "epoch": 3.485478710312577, + "grad_norm": 0.9174028635025024, + "learning_rate": 1.889512822418213e-06, + "loss": 0.596, + "step": 9442 + }, + { + "epoch": 3.485847895643613, + "grad_norm": 0.9504607319831848, + "learning_rate": 1.8890411835156271e-06, + "loss": 0.592, + "step": 9443 + }, + { + "epoch": 3.486217080974649, + "grad_norm": 0.9218973517417908, + "learning_rate": 1.888569567737639e-06, + "loss": 0.5429, + "step": 9444 + }, + { + "epoch": 3.4865862663056855, + "grad_norm": 1.036028265953064, + "learning_rate": 1.888097975102101e-06, + "loss": 0.581, + "step": 9445 + }, + { + "epoch": 3.486955451636722, + "grad_norm": 0.9268807768821716, + "learning_rate": 1.8876264056268604e-06, + "loss": 0.6043, + "step": 9446 + }, + { + "epoch": 3.4873246369677577, + "grad_norm": 0.9124754667282104, + "learning_rate": 1.8871548593297672e-06, + "loss": 0.5764, + "step": 9447 + }, + { + "epoch": 3.487693822298794, + "grad_norm": 0.9269412159919739, + "learning_rate": 1.8866833362286699e-06, + "loss": 0.5792, + "step": 9448 + }, + { + "epoch": 3.48806300762983, + "grad_norm": 0.9049989581108093, + "learning_rate": 1.8862118363414142e-06, + "loss": 0.5892, + "step": 9449 + }, + { + "epoch": 3.4884321929608664, + "grad_norm": 0.9759523868560791, + "learning_rate": 1.8857403596858472e-06, + "loss": 0.6165, + "step": 9450 + }, + { + "epoch": 3.4888013782919023, + "grad_norm": 0.9375272393226624, + "learning_rate": 1.8852689062798131e-06, + "loss": 0.599, + "step": 9451 + }, + { + "epoch": 3.4891705636229386, + "grad_norm": 0.9440256357192993, + "learning_rate": 1.8847974761411573e-06, + "loss": 0.5989, + "step": 9452 + }, + { + "epoch": 3.489539748953975, + "grad_norm": 0.9244765043258667, + "learning_rate": 1.884326069287723e-06, + "loss": 0.6044, + "step": 9453 + }, + { + "epoch": 3.489908934285011, + "grad_norm": 0.9307211637496948, + "learning_rate": 1.8838546857373518e-06, + "loss": 0.5931, + "step": 9454 + }, + { + "epoch": 3.4902781196160473, + "grad_norm": 0.9309192299842834, + "learning_rate": 1.8833833255078867e-06, + "loss": 0.6035, + "step": 9455 + }, + { + "epoch": 3.4906473049470836, + "grad_norm": 0.9521573185920715, + "learning_rate": 1.8829119886171688e-06, + "loss": 0.6016, + "step": 9456 + }, + { + "epoch": 3.4910164902781196, + "grad_norm": 0.9416539072990417, + "learning_rate": 1.8824406750830368e-06, + "loss": 0.5957, + "step": 9457 + }, + { + "epoch": 3.491385675609156, + "grad_norm": 0.9302670359611511, + "learning_rate": 1.8819693849233308e-06, + "loss": 0.5663, + "step": 9458 + }, + { + "epoch": 3.491754860940192, + "grad_norm": 0.9624859690666199, + "learning_rate": 1.881498118155888e-06, + "loss": 0.5983, + "step": 9459 + }, + { + "epoch": 3.492124046271228, + "grad_norm": 0.9377179741859436, + "learning_rate": 1.8810268747985464e-06, + "loss": 0.5852, + "step": 9460 + }, + { + "epoch": 3.492493231602264, + "grad_norm": 0.9387195706367493, + "learning_rate": 1.8805556548691425e-06, + "loss": 0.6064, + "step": 9461 + }, + { + "epoch": 3.4928624169333005, + "grad_norm": 0.9075819253921509, + "learning_rate": 1.8800844583855107e-06, + "loss": 0.5823, + "step": 9462 + }, + { + "epoch": 3.493231602264337, + "grad_norm": 0.9500899314880371, + "learning_rate": 1.8796132853654881e-06, + "loss": 0.598, + "step": 9463 + }, + { + "epoch": 3.4936007875953727, + "grad_norm": 0.9237416982650757, + "learning_rate": 1.8791421358269051e-06, + "loss": 0.6186, + "step": 9464 + }, + { + "epoch": 3.493969972926409, + "grad_norm": 0.8788281083106995, + "learning_rate": 1.8786710097875972e-06, + "loss": 0.5436, + "step": 9465 + }, + { + "epoch": 3.4943391582574455, + "grad_norm": 0.9296416640281677, + "learning_rate": 1.8781999072653956e-06, + "loss": 0.6101, + "step": 9466 + }, + { + "epoch": 3.4947083435884814, + "grad_norm": 0.9483667612075806, + "learning_rate": 1.877728828278131e-06, + "loss": 0.6188, + "step": 9467 + }, + { + "epoch": 3.4950775289195177, + "grad_norm": 0.9171264171600342, + "learning_rate": 1.8772577728436342e-06, + "loss": 0.6046, + "step": 9468 + }, + { + "epoch": 3.4954467142505536, + "grad_norm": 0.9284828901290894, + "learning_rate": 1.8767867409797335e-06, + "loss": 0.5485, + "step": 9469 + }, + { + "epoch": 3.49581589958159, + "grad_norm": 0.9159625172615051, + "learning_rate": 1.8763157327042584e-06, + "loss": 0.5824, + "step": 9470 + }, + { + "epoch": 3.496185084912626, + "grad_norm": 0.9325279593467712, + "learning_rate": 1.875844748035036e-06, + "loss": 0.5975, + "step": 9471 + }, + { + "epoch": 3.4965542702436623, + "grad_norm": 0.9310780167579651, + "learning_rate": 1.8753737869898921e-06, + "loss": 0.5789, + "step": 9472 + }, + { + "epoch": 3.4969234555746986, + "grad_norm": 0.9311102628707886, + "learning_rate": 1.8749028495866551e-06, + "loss": 0.5884, + "step": 9473 + }, + { + "epoch": 3.4972926409057346, + "grad_norm": 0.924474835395813, + "learning_rate": 1.8744319358431458e-06, + "loss": 0.5928, + "step": 9474 + }, + { + "epoch": 3.497661826236771, + "grad_norm": 0.9199841618537903, + "learning_rate": 1.8739610457771913e-06, + "loss": 0.569, + "step": 9475 + }, + { + "epoch": 3.4980310115678073, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.8734901794066141e-06, + "loss": 0.5654, + "step": 9476 + }, + { + "epoch": 3.498400196898843, + "grad_norm": 0.9634745121002197, + "learning_rate": 1.8730193367492349e-06, + "loss": 0.6379, + "step": 9477 + }, + { + "epoch": 3.4987693822298795, + "grad_norm": 0.9458030462265015, + "learning_rate": 1.872548517822877e-06, + "loss": 0.6207, + "step": 9478 + }, + { + "epoch": 3.4991385675609155, + "grad_norm": 0.9147154092788696, + "learning_rate": 1.872077722645359e-06, + "loss": 0.5252, + "step": 9479 + }, + { + "epoch": 3.499507752891952, + "grad_norm": 0.9521554708480835, + "learning_rate": 1.8716069512345008e-06, + "loss": 0.6003, + "step": 9480 + }, + { + "epoch": 3.4998769382229877, + "grad_norm": 0.9284259080886841, + "learning_rate": 1.8711362036081221e-06, + "loss": 0.5872, + "step": 9481 + }, + { + "epoch": 3.500246123554024, + "grad_norm": 0.908194899559021, + "learning_rate": 1.8706654797840382e-06, + "loss": 0.6052, + "step": 9482 + }, + { + "epoch": 3.5006153088850605, + "grad_norm": 0.9610242247581482, + "learning_rate": 1.870194779780069e-06, + "loss": 0.6019, + "step": 9483 + }, + { + "epoch": 3.5009844942160964, + "grad_norm": 0.9240999221801758, + "learning_rate": 1.869724103614027e-06, + "loss": 0.5824, + "step": 9484 + }, + { + "epoch": 3.5013536795471327, + "grad_norm": 0.9334954023361206, + "learning_rate": 1.8692534513037291e-06, + "loss": 0.6374, + "step": 9485 + }, + { + "epoch": 3.501722864878169, + "grad_norm": 0.9125884175300598, + "learning_rate": 1.8687828228669898e-06, + "loss": 0.5852, + "step": 9486 + }, + { + "epoch": 3.502092050209205, + "grad_norm": 0.9158222079277039, + "learning_rate": 1.868312218321621e-06, + "loss": 0.6061, + "step": 9487 + }, + { + "epoch": 3.5024612355402414, + "grad_norm": 0.9221826195716858, + "learning_rate": 1.8678416376854359e-06, + "loss": 0.5685, + "step": 9488 + }, + { + "epoch": 3.5028304208712773, + "grad_norm": 0.9375458359718323, + "learning_rate": 1.8673710809762447e-06, + "loss": 0.5568, + "step": 9489 + }, + { + "epoch": 3.5031996062023136, + "grad_norm": 0.9457985162734985, + "learning_rate": 1.866900548211858e-06, + "loss": 0.5995, + "step": 9490 + }, + { + "epoch": 3.5035687915333495, + "grad_norm": 0.9261751770973206, + "learning_rate": 1.8664300394100868e-06, + "loss": 0.5355, + "step": 9491 + }, + { + "epoch": 3.503937976864386, + "grad_norm": 0.928068995475769, + "learning_rate": 1.8659595545887375e-06, + "loss": 0.5874, + "step": 9492 + }, + { + "epoch": 3.5043071621954223, + "grad_norm": 0.9096341133117676, + "learning_rate": 1.8654890937656192e-06, + "loss": 0.5792, + "step": 9493 + }, + { + "epoch": 3.504676347526458, + "grad_norm": 0.9201680421829224, + "learning_rate": 1.8650186569585394e-06, + "loss": 0.577, + "step": 9494 + }, + { + "epoch": 3.5050455328574945, + "grad_norm": 0.9253438711166382, + "learning_rate": 1.864548244185302e-06, + "loss": 0.5894, + "step": 9495 + }, + { + "epoch": 3.505414718188531, + "grad_norm": 0.9166371822357178, + "learning_rate": 1.864077855463714e-06, + "loss": 0.5931, + "step": 9496 + }, + { + "epoch": 3.505783903519567, + "grad_norm": 0.9379866719245911, + "learning_rate": 1.8636074908115775e-06, + "loss": 0.596, + "step": 9497 + }, + { + "epoch": 3.5061530888506027, + "grad_norm": 0.9691398739814758, + "learning_rate": 1.8631371502466972e-06, + "loss": 0.6011, + "step": 9498 + }, + { + "epoch": 3.506522274181639, + "grad_norm": 0.9251505136489868, + "learning_rate": 1.8626668337868748e-06, + "loss": 0.6046, + "step": 9499 + }, + { + "epoch": 3.5068914595126754, + "grad_norm": 0.9305524230003357, + "learning_rate": 1.862196541449911e-06, + "loss": 0.5953, + "step": 9500 + }, + { + "epoch": 3.5072606448437114, + "grad_norm": 0.9265029430389404, + "learning_rate": 1.8617262732536078e-06, + "loss": 0.5974, + "step": 9501 + }, + { + "epoch": 3.5076298301747477, + "grad_norm": 0.9202384352684021, + "learning_rate": 1.8612560292157623e-06, + "loss": 0.5573, + "step": 9502 + }, + { + "epoch": 3.507999015505784, + "grad_norm": 0.9170202612876892, + "learning_rate": 1.8607858093541753e-06, + "loss": 0.5943, + "step": 9503 + }, + { + "epoch": 3.50836820083682, + "grad_norm": 0.9127691388130188, + "learning_rate": 1.860315613686644e-06, + "loss": 0.591, + "step": 9504 + }, + { + "epoch": 3.5087373861678564, + "grad_norm": 0.9375178813934326, + "learning_rate": 1.8598454422309641e-06, + "loss": 0.6041, + "step": 9505 + }, + { + "epoch": 3.5091065714988927, + "grad_norm": 0.9225568175315857, + "learning_rate": 1.8593752950049331e-06, + "loss": 0.5743, + "step": 9506 + }, + { + "epoch": 3.5094757568299286, + "grad_norm": 0.9428280591964722, + "learning_rate": 1.8589051720263447e-06, + "loss": 0.6026, + "step": 9507 + }, + { + "epoch": 3.5098449421609645, + "grad_norm": 0.8982357382774353, + "learning_rate": 1.8584350733129924e-06, + "loss": 0.5546, + "step": 9508 + }, + { + "epoch": 3.510214127492001, + "grad_norm": 0.8950639367103577, + "learning_rate": 1.8579649988826716e-06, + "loss": 0.5717, + "step": 9509 + }, + { + "epoch": 3.5105833128230373, + "grad_norm": 0.9397900104522705, + "learning_rate": 1.8574949487531718e-06, + "loss": 0.585, + "step": 9510 + }, + { + "epoch": 3.510952498154073, + "grad_norm": 0.903022825717926, + "learning_rate": 1.857024922942286e-06, + "loss": 0.5823, + "step": 9511 + }, + { + "epoch": 3.5113216834851095, + "grad_norm": 0.9201186299324036, + "learning_rate": 1.8565549214678027e-06, + "loss": 0.5953, + "step": 9512 + }, + { + "epoch": 3.511690868816146, + "grad_norm": 0.9491259455680847, + "learning_rate": 1.8560849443475134e-06, + "loss": 0.5898, + "step": 9513 + }, + { + "epoch": 3.512060054147182, + "grad_norm": 0.9305737614631653, + "learning_rate": 1.855614991599206e-06, + "loss": 0.6116, + "step": 9514 + }, + { + "epoch": 3.512429239478218, + "grad_norm": 0.9230583906173706, + "learning_rate": 1.8551450632406676e-06, + "loss": 0.5626, + "step": 9515 + }, + { + "epoch": 3.512798424809254, + "grad_norm": 0.925311803817749, + "learning_rate": 1.8546751592896855e-06, + "loss": 0.5981, + "step": 9516 + }, + { + "epoch": 3.5131676101402904, + "grad_norm": 0.9164520502090454, + "learning_rate": 1.8542052797640442e-06, + "loss": 0.5828, + "step": 9517 + }, + { + "epoch": 3.5135367954713264, + "grad_norm": 0.8951064348220825, + "learning_rate": 1.8537354246815296e-06, + "loss": 0.5888, + "step": 9518 + }, + { + "epoch": 3.5139059808023627, + "grad_norm": 0.8977219462394714, + "learning_rate": 1.8532655940599253e-06, + "loss": 0.5824, + "step": 9519 + }, + { + "epoch": 3.514275166133399, + "grad_norm": 0.904502272605896, + "learning_rate": 1.8527957879170139e-06, + "loss": 0.5544, + "step": 9520 + }, + { + "epoch": 3.514644351464435, + "grad_norm": 0.9508926868438721, + "learning_rate": 1.8523260062705783e-06, + "loss": 0.623, + "step": 9521 + }, + { + "epoch": 3.5150135367954713, + "grad_norm": 0.9308645129203796, + "learning_rate": 1.8518562491383976e-06, + "loss": 0.5936, + "step": 9522 + }, + { + "epoch": 3.5153827221265077, + "grad_norm": 0.9300792217254639, + "learning_rate": 1.8513865165382538e-06, + "loss": 0.5909, + "step": 9523 + }, + { + "epoch": 3.5157519074575436, + "grad_norm": 0.9275345802307129, + "learning_rate": 1.8509168084879265e-06, + "loss": 0.5843, + "step": 9524 + }, + { + "epoch": 3.51612109278858, + "grad_norm": 0.8908158540725708, + "learning_rate": 1.8504471250051926e-06, + "loss": 0.5894, + "step": 9525 + }, + { + "epoch": 3.516490278119616, + "grad_norm": 0.9163076877593994, + "learning_rate": 1.8499774661078295e-06, + "loss": 0.5593, + "step": 9526 + }, + { + "epoch": 3.5168594634506523, + "grad_norm": 0.9581574201583862, + "learning_rate": 1.8495078318136153e-06, + "loss": 0.6013, + "step": 9527 + }, + { + "epoch": 3.517228648781688, + "grad_norm": 0.9602099061012268, + "learning_rate": 1.8490382221403237e-06, + "loss": 0.6053, + "step": 9528 + }, + { + "epoch": 3.5175978341127245, + "grad_norm": 0.9325435757637024, + "learning_rate": 1.8485686371057305e-06, + "loss": 0.6067, + "step": 9529 + }, + { + "epoch": 3.517967019443761, + "grad_norm": 0.9367010593414307, + "learning_rate": 1.848099076727608e-06, + "loss": 0.6252, + "step": 9530 + }, + { + "epoch": 3.518336204774797, + "grad_norm": 0.9426878094673157, + "learning_rate": 1.8476295410237294e-06, + "loss": 0.5871, + "step": 9531 + }, + { + "epoch": 3.518705390105833, + "grad_norm": 0.9480910301208496, + "learning_rate": 1.8471600300118675e-06, + "loss": 0.6262, + "step": 9532 + }, + { + "epoch": 3.5190745754368695, + "grad_norm": 0.9581538438796997, + "learning_rate": 1.8466905437097922e-06, + "loss": 0.5988, + "step": 9533 + }, + { + "epoch": 3.5194437607679054, + "grad_norm": 0.9281653165817261, + "learning_rate": 1.8462210821352742e-06, + "loss": 0.5953, + "step": 9534 + }, + { + "epoch": 3.519812946098942, + "grad_norm": 0.9104658365249634, + "learning_rate": 1.8457516453060815e-06, + "loss": 0.5731, + "step": 9535 + }, + { + "epoch": 3.5201821314299777, + "grad_norm": 0.8989147543907166, + "learning_rate": 1.8452822332399822e-06, + "loss": 0.572, + "step": 9536 + }, + { + "epoch": 3.520551316761014, + "grad_norm": 0.9902005791664124, + "learning_rate": 1.8448128459547442e-06, + "loss": 0.5822, + "step": 9537 + }, + { + "epoch": 3.52092050209205, + "grad_norm": 0.9406919479370117, + "learning_rate": 1.8443434834681328e-06, + "loss": 0.6198, + "step": 9538 + }, + { + "epoch": 3.5212896874230863, + "grad_norm": 0.9445244073867798, + "learning_rate": 1.8438741457979143e-06, + "loss": 0.6256, + "step": 9539 + }, + { + "epoch": 3.5216588727541227, + "grad_norm": 0.9289875030517578, + "learning_rate": 1.8434048329618518e-06, + "loss": 0.5982, + "step": 9540 + }, + { + "epoch": 3.5220280580851586, + "grad_norm": 0.92833411693573, + "learning_rate": 1.8429355449777087e-06, + "loss": 0.5673, + "step": 9541 + }, + { + "epoch": 3.522397243416195, + "grad_norm": 0.9345184564590454, + "learning_rate": 1.8424662818632485e-06, + "loss": 0.6111, + "step": 9542 + }, + { + "epoch": 3.5227664287472313, + "grad_norm": 0.9337489008903503, + "learning_rate": 1.8419970436362317e-06, + "loss": 0.5927, + "step": 9543 + }, + { + "epoch": 3.5231356140782673, + "grad_norm": 0.9032337665557861, + "learning_rate": 1.8415278303144197e-06, + "loss": 0.5709, + "step": 9544 + }, + { + "epoch": 3.5235047994093036, + "grad_norm": 0.9254283905029297, + "learning_rate": 1.8410586419155707e-06, + "loss": 0.5704, + "step": 9545 + }, + { + "epoch": 3.5238739847403395, + "grad_norm": 0.9400042295455933, + "learning_rate": 1.8405894784574445e-06, + "loss": 0.6492, + "step": 9546 + }, + { + "epoch": 3.524243170071376, + "grad_norm": 0.9611815214157104, + "learning_rate": 1.8401203399577988e-06, + "loss": 0.6107, + "step": 9547 + }, + { + "epoch": 3.524612355402412, + "grad_norm": 0.9359035491943359, + "learning_rate": 1.8396512264343897e-06, + "loss": 0.5855, + "step": 9548 + }, + { + "epoch": 3.524981540733448, + "grad_norm": 0.9713009595870972, + "learning_rate": 1.8391821379049734e-06, + "loss": 0.6108, + "step": 9549 + }, + { + "epoch": 3.5253507260644845, + "grad_norm": 0.9576222896575928, + "learning_rate": 1.8387130743873044e-06, + "loss": 0.6133, + "step": 9550 + }, + { + "epoch": 3.5257199113955204, + "grad_norm": 0.9116846323013306, + "learning_rate": 1.838244035899136e-06, + "loss": 0.5699, + "step": 9551 + }, + { + "epoch": 3.526089096726557, + "grad_norm": 0.9543426036834717, + "learning_rate": 1.8377750224582233e-06, + "loss": 0.5716, + "step": 9552 + }, + { + "epoch": 3.526458282057593, + "grad_norm": 0.9254000782966614, + "learning_rate": 1.8373060340823162e-06, + "loss": 0.5639, + "step": 9553 + }, + { + "epoch": 3.526827467388629, + "grad_norm": 0.8847162127494812, + "learning_rate": 1.8368370707891676e-06, + "loss": 0.5771, + "step": 9554 + }, + { + "epoch": 3.5271966527196654, + "grad_norm": 0.9201096296310425, + "learning_rate": 1.8363681325965254e-06, + "loss": 0.5633, + "step": 9555 + }, + { + "epoch": 3.5275658380507013, + "grad_norm": 0.9702960848808289, + "learning_rate": 1.8358992195221404e-06, + "loss": 0.6255, + "step": 9556 + }, + { + "epoch": 3.5279350233817377, + "grad_norm": 1.1582074165344238, + "learning_rate": 1.8354303315837608e-06, + "loss": 0.5566, + "step": 9557 + }, + { + "epoch": 3.5283042087127736, + "grad_norm": 0.9326962232589722, + "learning_rate": 1.834961468799133e-06, + "loss": 0.6176, + "step": 9558 + }, + { + "epoch": 3.52867339404381, + "grad_norm": 0.9777815341949463, + "learning_rate": 1.834492631186004e-06, + "loss": 0.5829, + "step": 9559 + }, + { + "epoch": 3.5290425793748463, + "grad_norm": 0.9037983417510986, + "learning_rate": 1.8340238187621185e-06, + "loss": 0.5444, + "step": 9560 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.9631421566009521, + "learning_rate": 1.8335550315452204e-06, + "loss": 0.5911, + "step": 9561 + }, + { + "epoch": 3.5297809500369186, + "grad_norm": 0.9246695637702942, + "learning_rate": 1.8330862695530555e-06, + "loss": 0.5898, + "step": 9562 + }, + { + "epoch": 3.530150135367955, + "grad_norm": 0.9449991583824158, + "learning_rate": 1.832617532803364e-06, + "loss": 0.619, + "step": 9563 + }, + { + "epoch": 3.530519320698991, + "grad_norm": 0.9153415560722351, + "learning_rate": 1.8321488213138884e-06, + "loss": 0.583, + "step": 9564 + }, + { + "epoch": 3.530888506030027, + "grad_norm": 0.9319022297859192, + "learning_rate": 1.8316801351023695e-06, + "loss": 0.6134, + "step": 9565 + }, + { + "epoch": 3.531257691361063, + "grad_norm": 0.9099746346473694, + "learning_rate": 1.8312114741865461e-06, + "loss": 0.5943, + "step": 9566 + }, + { + "epoch": 3.5316268766920995, + "grad_norm": 0.9323531985282898, + "learning_rate": 1.830742838584158e-06, + "loss": 0.59, + "step": 9567 + }, + { + "epoch": 3.5319960620231354, + "grad_norm": 0.9377449750900269, + "learning_rate": 1.8302742283129416e-06, + "loss": 0.6301, + "step": 9568 + }, + { + "epoch": 3.532365247354172, + "grad_norm": 0.9063496589660645, + "learning_rate": 1.8298056433906339e-06, + "loss": 0.5829, + "step": 9569 + }, + { + "epoch": 3.532734432685208, + "grad_norm": 0.9414238333702087, + "learning_rate": 1.8293370838349724e-06, + "loss": 0.5857, + "step": 9570 + }, + { + "epoch": 3.533103618016244, + "grad_norm": 0.922554075717926, + "learning_rate": 1.8288685496636894e-06, + "loss": 0.5885, + "step": 9571 + }, + { + "epoch": 3.5334728033472804, + "grad_norm": 0.9124034643173218, + "learning_rate": 1.8284000408945207e-06, + "loss": 0.6376, + "step": 9572 + }, + { + "epoch": 3.533841988678317, + "grad_norm": 0.9025173783302307, + "learning_rate": 1.8279315575451982e-06, + "loss": 0.6198, + "step": 9573 + }, + { + "epoch": 3.5342111740093527, + "grad_norm": 0.9635813236236572, + "learning_rate": 1.8274630996334542e-06, + "loss": 0.5615, + "step": 9574 + }, + { + "epoch": 3.5345803593403886, + "grad_norm": 0.9088558554649353, + "learning_rate": 1.8269946671770198e-06, + "loss": 0.5702, + "step": 9575 + }, + { + "epoch": 3.534949544671425, + "grad_norm": 0.9250110983848572, + "learning_rate": 1.8265262601936245e-06, + "loss": 0.5966, + "step": 9576 + }, + { + "epoch": 3.5353187300024613, + "grad_norm": 0.9276784658432007, + "learning_rate": 1.826057878700998e-06, + "loss": 0.5785, + "step": 9577 + }, + { + "epoch": 3.5356879153334972, + "grad_norm": 0.9293143153190613, + "learning_rate": 1.825589522716868e-06, + "loss": 0.5705, + "step": 9578 + }, + { + "epoch": 3.5360571006645336, + "grad_norm": 0.9486907124519348, + "learning_rate": 1.825121192258961e-06, + "loss": 0.6221, + "step": 9579 + }, + { + "epoch": 3.53642628599557, + "grad_norm": 0.9497941732406616, + "learning_rate": 1.8246528873450054e-06, + "loss": 0.5925, + "step": 9580 + }, + { + "epoch": 3.536795471326606, + "grad_norm": 0.9442281723022461, + "learning_rate": 1.8241846079927233e-06, + "loss": 0.5752, + "step": 9581 + }, + { + "epoch": 3.5371646566576422, + "grad_norm": 0.9256633520126343, + "learning_rate": 1.8237163542198415e-06, + "loss": 0.5835, + "step": 9582 + }, + { + "epoch": 3.537533841988678, + "grad_norm": 0.9407906532287598, + "learning_rate": 1.823248126044082e-06, + "loss": 0.6167, + "step": 9583 + }, + { + "epoch": 3.5379030273197145, + "grad_norm": 0.9185263514518738, + "learning_rate": 1.8227799234831671e-06, + "loss": 0.5821, + "step": 9584 + }, + { + "epoch": 3.5382722126507504, + "grad_norm": 0.9573516249656677, + "learning_rate": 1.822311746554819e-06, + "loss": 0.5855, + "step": 9585 + }, + { + "epoch": 3.538641397981787, + "grad_norm": 0.9217113256454468, + "learning_rate": 1.8218435952767565e-06, + "loss": 0.6124, + "step": 9586 + }, + { + "epoch": 3.539010583312823, + "grad_norm": 0.9299331903457642, + "learning_rate": 1.8213754696667008e-06, + "loss": 0.6276, + "step": 9587 + }, + { + "epoch": 3.539379768643859, + "grad_norm": 0.9284979104995728, + "learning_rate": 1.820907369742369e-06, + "loss": 0.5912, + "step": 9588 + }, + { + "epoch": 3.5397489539748954, + "grad_norm": 0.9440866708755493, + "learning_rate": 1.8204392955214783e-06, + "loss": 0.6008, + "step": 9589 + }, + { + "epoch": 3.5401181393059318, + "grad_norm": 0.9362778663635254, + "learning_rate": 1.8199712470217473e-06, + "loss": 0.5669, + "step": 9590 + }, + { + "epoch": 3.5404873246369677, + "grad_norm": 0.9393783211708069, + "learning_rate": 1.8195032242608884e-06, + "loss": 0.5939, + "step": 9591 + }, + { + "epoch": 3.540856509968004, + "grad_norm": 0.9372497200965881, + "learning_rate": 1.8190352272566186e-06, + "loss": 0.6071, + "step": 9592 + }, + { + "epoch": 3.54122569529904, + "grad_norm": 0.9263262152671814, + "learning_rate": 1.81856725602665e-06, + "loss": 0.5777, + "step": 9593 + }, + { + "epoch": 3.5415948806300763, + "grad_norm": 0.9215299487113953, + "learning_rate": 1.818099310588696e-06, + "loss": 0.5817, + "step": 9594 + }, + { + "epoch": 3.5419640659611122, + "grad_norm": 0.9510741233825684, + "learning_rate": 1.8176313909604681e-06, + "loss": 0.598, + "step": 9595 + }, + { + "epoch": 3.5423332512921486, + "grad_norm": 0.9517301321029663, + "learning_rate": 1.8171634971596763e-06, + "loss": 0.6167, + "step": 9596 + }, + { + "epoch": 3.542702436623185, + "grad_norm": 0.957533597946167, + "learning_rate": 1.816695629204031e-06, + "loss": 0.6048, + "step": 9597 + }, + { + "epoch": 3.543071621954221, + "grad_norm": 0.9050398468971252, + "learning_rate": 1.8162277871112404e-06, + "loss": 0.5424, + "step": 9598 + }, + { + "epoch": 3.5434408072852572, + "grad_norm": 0.9155235886573792, + "learning_rate": 1.8157599708990114e-06, + "loss": 0.5591, + "step": 9599 + }, + { + "epoch": 3.5438099926162936, + "grad_norm": 0.9178012609481812, + "learning_rate": 1.815292180585053e-06, + "loss": 0.5874, + "step": 9600 + }, + { + "epoch": 3.5441791779473295, + "grad_norm": 0.9293293356895447, + "learning_rate": 1.814824416187068e-06, + "loss": 0.5992, + "step": 9601 + }, + { + "epoch": 3.544548363278366, + "grad_norm": 0.9334662556648254, + "learning_rate": 1.8143566777227633e-06, + "loss": 0.6027, + "step": 9602 + }, + { + "epoch": 3.544917548609402, + "grad_norm": 0.9153474569320679, + "learning_rate": 1.8138889652098425e-06, + "loss": 0.5719, + "step": 9603 + }, + { + "epoch": 3.545286733940438, + "grad_norm": 0.920801043510437, + "learning_rate": 1.8134212786660072e-06, + "loss": 0.6032, + "step": 9604 + }, + { + "epoch": 3.545655919271474, + "grad_norm": 0.9147272706031799, + "learning_rate": 1.8129536181089601e-06, + "loss": 0.5717, + "step": 9605 + }, + { + "epoch": 3.5460251046025104, + "grad_norm": 0.9174989461898804, + "learning_rate": 1.8124859835564018e-06, + "loss": 0.5752, + "step": 9606 + }, + { + "epoch": 3.5463942899335468, + "grad_norm": 0.938650369644165, + "learning_rate": 1.8120183750260313e-06, + "loss": 0.5991, + "step": 9607 + }, + { + "epoch": 3.5467634752645827, + "grad_norm": 0.9331759214401245, + "learning_rate": 1.811550792535549e-06, + "loss": 0.6313, + "step": 9608 + }, + { + "epoch": 3.547132660595619, + "grad_norm": 0.925061821937561, + "learning_rate": 1.8110832361026512e-06, + "loss": 0.5996, + "step": 9609 + }, + { + "epoch": 3.5475018459266554, + "grad_norm": 0.9530702829360962, + "learning_rate": 1.8106157057450368e-06, + "loss": 0.6168, + "step": 9610 + }, + { + "epoch": 3.5478710312576913, + "grad_norm": 0.941872775554657, + "learning_rate": 1.8101482014803988e-06, + "loss": 0.5894, + "step": 9611 + }, + { + "epoch": 3.5482402165887277, + "grad_norm": 0.9101320505142212, + "learning_rate": 1.809680723326434e-06, + "loss": 0.5833, + "step": 9612 + }, + { + "epoch": 3.5486094019197636, + "grad_norm": 0.9157885313034058, + "learning_rate": 1.8092132713008368e-06, + "loss": 0.5804, + "step": 9613 + }, + { + "epoch": 3.5489785872508, + "grad_norm": 0.9283676147460938, + "learning_rate": 1.8087458454212986e-06, + "loss": 0.5843, + "step": 9614 + }, + { + "epoch": 3.549347772581836, + "grad_norm": 0.9593942761421204, + "learning_rate": 1.8082784457055125e-06, + "loss": 0.5787, + "step": 9615 + }, + { + "epoch": 3.5497169579128722, + "grad_norm": 0.9533437490463257, + "learning_rate": 1.8078110721711684e-06, + "loss": 0.5919, + "step": 9616 + }, + { + "epoch": 3.5500861432439086, + "grad_norm": 0.9162729382514954, + "learning_rate": 1.807343724835956e-06, + "loss": 0.5943, + "step": 9617 + }, + { + "epoch": 3.5504553285749445, + "grad_norm": 0.9186127781867981, + "learning_rate": 1.8068764037175668e-06, + "loss": 0.5782, + "step": 9618 + }, + { + "epoch": 3.550824513905981, + "grad_norm": 0.9509430527687073, + "learning_rate": 1.8064091088336854e-06, + "loss": 0.6114, + "step": 9619 + }, + { + "epoch": 3.551193699237017, + "grad_norm": 0.9413036108016968, + "learning_rate": 1.805941840202002e-06, + "loss": 0.6128, + "step": 9620 + }, + { + "epoch": 3.551562884568053, + "grad_norm": 0.933337926864624, + "learning_rate": 1.8054745978401995e-06, + "loss": 0.5862, + "step": 9621 + }, + { + "epoch": 3.5519320698990895, + "grad_norm": 0.8897302150726318, + "learning_rate": 1.8050073817659641e-06, + "loss": 0.5771, + "step": 9622 + }, + { + "epoch": 3.5523012552301254, + "grad_norm": 0.9145559668540955, + "learning_rate": 1.8045401919969813e-06, + "loss": 0.5847, + "step": 9623 + }, + { + "epoch": 3.5526704405611618, + "grad_norm": 0.9199747443199158, + "learning_rate": 1.8040730285509316e-06, + "loss": 0.566, + "step": 9624 + }, + { + "epoch": 3.5530396258921977, + "grad_norm": 0.9362497925758362, + "learning_rate": 1.803605891445499e-06, + "loss": 0.5912, + "step": 9625 + }, + { + "epoch": 3.553408811223234, + "grad_norm": 0.8828123211860657, + "learning_rate": 1.803138780698363e-06, + "loss": 0.5304, + "step": 9626 + }, + { + "epoch": 3.5537779965542704, + "grad_norm": 0.9237546920776367, + "learning_rate": 1.8026716963272035e-06, + "loss": 0.5832, + "step": 9627 + }, + { + "epoch": 3.5541471818853063, + "grad_norm": 0.9592643976211548, + "learning_rate": 1.8022046383497022e-06, + "loss": 0.6016, + "step": 9628 + }, + { + "epoch": 3.5545163672163427, + "grad_norm": 0.9313174486160278, + "learning_rate": 1.8017376067835333e-06, + "loss": 0.5885, + "step": 9629 + }, + { + "epoch": 3.554885552547379, + "grad_norm": 0.9169951677322388, + "learning_rate": 1.8012706016463772e-06, + "loss": 0.588, + "step": 9630 + }, + { + "epoch": 3.555254737878415, + "grad_norm": 1.4128175973892212, + "learning_rate": 1.8008036229559067e-06, + "loss": 0.6204, + "step": 9631 + }, + { + "epoch": 3.5556239232094513, + "grad_norm": 0.9491999745368958, + "learning_rate": 1.8003366707297987e-06, + "loss": 0.623, + "step": 9632 + }, + { + "epoch": 3.5559931085404872, + "grad_norm": 0.9298705458641052, + "learning_rate": 1.7998697449857277e-06, + "loss": 0.5636, + "step": 9633 + }, + { + "epoch": 3.5563622938715236, + "grad_norm": 0.9301338195800781, + "learning_rate": 1.7994028457413653e-06, + "loss": 0.5831, + "step": 9634 + }, + { + "epoch": 3.5567314792025595, + "grad_norm": 0.946210503578186, + "learning_rate": 1.7989359730143845e-06, + "loss": 0.5951, + "step": 9635 + }, + { + "epoch": 3.557100664533596, + "grad_norm": 0.935107409954071, + "learning_rate": 1.7984691268224557e-06, + "loss": 0.532, + "step": 9636 + }, + { + "epoch": 3.557469849864632, + "grad_norm": 0.9644685983657837, + "learning_rate": 1.7980023071832482e-06, + "loss": 0.5902, + "step": 9637 + }, + { + "epoch": 3.557839035195668, + "grad_norm": 0.9264189600944519, + "learning_rate": 1.7975355141144335e-06, + "loss": 0.5863, + "step": 9638 + }, + { + "epoch": 3.5582082205267045, + "grad_norm": 0.9217782616615295, + "learning_rate": 1.7970687476336763e-06, + "loss": 0.5815, + "step": 9639 + }, + { + "epoch": 3.558577405857741, + "grad_norm": 0.955021321773529, + "learning_rate": 1.7966020077586458e-06, + "loss": 0.5992, + "step": 9640 + }, + { + "epoch": 3.5589465911887768, + "grad_norm": 0.9223068356513977, + "learning_rate": 1.7961352945070076e-06, + "loss": 0.585, + "step": 9641 + }, + { + "epoch": 3.5593157765198127, + "grad_norm": 0.9453595280647278, + "learning_rate": 1.7956686078964257e-06, + "loss": 0.5923, + "step": 9642 + }, + { + "epoch": 3.559684961850849, + "grad_norm": 0.9357618689537048, + "learning_rate": 1.7952019479445656e-06, + "loss": 0.5995, + "step": 9643 + }, + { + "epoch": 3.5600541471818854, + "grad_norm": 0.9215959310531616, + "learning_rate": 1.7947353146690888e-06, + "loss": 0.5962, + "step": 9644 + }, + { + "epoch": 3.5604233325129213, + "grad_norm": 0.9568697810173035, + "learning_rate": 1.794268708087657e-06, + "loss": 0.5916, + "step": 9645 + }, + { + "epoch": 3.5607925178439577, + "grad_norm": 0.9197744131088257, + "learning_rate": 1.7938021282179335e-06, + "loss": 0.5831, + "step": 9646 + }, + { + "epoch": 3.561161703174994, + "grad_norm": 0.9523927569389343, + "learning_rate": 1.7933355750775751e-06, + "loss": 0.5927, + "step": 9647 + }, + { + "epoch": 3.56153088850603, + "grad_norm": 0.9173294305801392, + "learning_rate": 1.7928690486842438e-06, + "loss": 0.6051, + "step": 9648 + }, + { + "epoch": 3.5619000738370663, + "grad_norm": 0.918562114238739, + "learning_rate": 1.7924025490555942e-06, + "loss": 0.6137, + "step": 9649 + }, + { + "epoch": 3.5622692591681027, + "grad_norm": 0.9371715188026428, + "learning_rate": 1.7919360762092853e-06, + "loss": 0.5899, + "step": 9650 + }, + { + "epoch": 3.5626384444991386, + "grad_norm": 0.9358276128768921, + "learning_rate": 1.7914696301629728e-06, + "loss": 0.5493, + "step": 9651 + }, + { + "epoch": 3.5630076298301745, + "grad_norm": 0.9521234035491943, + "learning_rate": 1.7910032109343107e-06, + "loss": 0.6021, + "step": 9652 + }, + { + "epoch": 3.563376815161211, + "grad_norm": 0.9266873002052307, + "learning_rate": 1.7905368185409538e-06, + "loss": 0.603, + "step": 9653 + }, + { + "epoch": 3.563746000492247, + "grad_norm": 0.9200506210327148, + "learning_rate": 1.790070453000554e-06, + "loss": 0.5966, + "step": 9654 + }, + { + "epoch": 3.564115185823283, + "grad_norm": 0.9228746294975281, + "learning_rate": 1.789604114330763e-06, + "loss": 0.6266, + "step": 9655 + }, + { + "epoch": 3.5644843711543195, + "grad_norm": 0.9450051784515381, + "learning_rate": 1.7891378025492336e-06, + "loss": 0.5922, + "step": 9656 + }, + { + "epoch": 3.564853556485356, + "grad_norm": 0.9484648704528809, + "learning_rate": 1.7886715176736125e-06, + "loss": 0.5747, + "step": 9657 + }, + { + "epoch": 3.5652227418163918, + "grad_norm": 0.9478430151939392, + "learning_rate": 1.7882052597215516e-06, + "loss": 0.5753, + "step": 9658 + }, + { + "epoch": 3.565591927147428, + "grad_norm": 0.9510417580604553, + "learning_rate": 1.7877390287106951e-06, + "loss": 0.6089, + "step": 9659 + }, + { + "epoch": 3.565961112478464, + "grad_norm": 0.9276120662689209, + "learning_rate": 1.7872728246586923e-06, + "loss": 0.5586, + "step": 9660 + }, + { + "epoch": 3.5663302978095004, + "grad_norm": 0.9333484172821045, + "learning_rate": 1.7868066475831886e-06, + "loss": 0.5889, + "step": 9661 + }, + { + "epoch": 3.5666994831405363, + "grad_norm": 0.924732506275177, + "learning_rate": 1.7863404975018278e-06, + "loss": 0.5978, + "step": 9662 + }, + { + "epoch": 3.5670686684715727, + "grad_norm": 0.9461908936500549, + "learning_rate": 1.7858743744322544e-06, + "loss": 0.5882, + "step": 9663 + }, + { + "epoch": 3.567437853802609, + "grad_norm": 0.913655161857605, + "learning_rate": 1.78540827839211e-06, + "loss": 0.6225, + "step": 9664 + }, + { + "epoch": 3.567807039133645, + "grad_norm": 0.9526889324188232, + "learning_rate": 1.7849422093990363e-06, + "loss": 0.5812, + "step": 9665 + }, + { + "epoch": 3.5681762244646813, + "grad_norm": 0.8995164036750793, + "learning_rate": 1.7844761674706757e-06, + "loss": 0.584, + "step": 9666 + }, + { + "epoch": 3.5685454097957177, + "grad_norm": 0.9190488457679749, + "learning_rate": 1.784010152624665e-06, + "loss": 0.5761, + "step": 9667 + }, + { + "epoch": 3.5689145951267536, + "grad_norm": 0.9506527781486511, + "learning_rate": 1.7835441648786456e-06, + "loss": 0.6137, + "step": 9668 + }, + { + "epoch": 3.56928378045779, + "grad_norm": 0.9227373600006104, + "learning_rate": 1.7830782042502515e-06, + "loss": 0.616, + "step": 9669 + }, + { + "epoch": 3.569652965788826, + "grad_norm": 0.9066543579101562, + "learning_rate": 1.782612270757122e-06, + "loss": 0.5963, + "step": 9670 + }, + { + "epoch": 3.570022151119862, + "grad_norm": 0.9235442876815796, + "learning_rate": 1.7821463644168917e-06, + "loss": 0.5955, + "step": 9671 + }, + { + "epoch": 3.570391336450898, + "grad_norm": 0.9326162338256836, + "learning_rate": 1.781680485247195e-06, + "loss": 0.5487, + "step": 9672 + }, + { + "epoch": 3.5707605217819345, + "grad_norm": 0.9431804418563843, + "learning_rate": 1.7812146332656656e-06, + "loss": 0.5932, + "step": 9673 + }, + { + "epoch": 3.571129707112971, + "grad_norm": 0.9176443815231323, + "learning_rate": 1.7807488084899348e-06, + "loss": 0.6193, + "step": 9674 + }, + { + "epoch": 3.5714988924440068, + "grad_norm": 0.8835244178771973, + "learning_rate": 1.780283010937634e-06, + "loss": 0.5769, + "step": 9675 + }, + { + "epoch": 3.571868077775043, + "grad_norm": 0.9246625900268555, + "learning_rate": 1.7798172406263959e-06, + "loss": 0.5461, + "step": 9676 + }, + { + "epoch": 3.5722372631060795, + "grad_norm": 0.923555850982666, + "learning_rate": 1.7793514975738464e-06, + "loss": 0.5703, + "step": 9677 + }, + { + "epoch": 3.5726064484371154, + "grad_norm": 0.9319658279418945, + "learning_rate": 1.7788857817976158e-06, + "loss": 0.6145, + "step": 9678 + }, + { + "epoch": 3.5729756337681517, + "grad_norm": 0.932144284248352, + "learning_rate": 1.7784200933153312e-06, + "loss": 0.6022, + "step": 9679 + }, + { + "epoch": 3.5733448190991877, + "grad_norm": 0.9595020413398743, + "learning_rate": 1.7779544321446176e-06, + "loss": 0.6151, + "step": 9680 + }, + { + "epoch": 3.573714004430224, + "grad_norm": 0.9076567888259888, + "learning_rate": 1.7774887983031016e-06, + "loss": 0.5535, + "step": 9681 + }, + { + "epoch": 3.57408318976126, + "grad_norm": 0.9150970578193665, + "learning_rate": 1.777023191808406e-06, + "loss": 0.5754, + "step": 9682 + }, + { + "epoch": 3.5744523750922963, + "grad_norm": 0.9514256119728088, + "learning_rate": 1.776557612678154e-06, + "loss": 0.6164, + "step": 9683 + }, + { + "epoch": 3.5748215604233327, + "grad_norm": 0.9439200162887573, + "learning_rate": 1.7760920609299696e-06, + "loss": 0.6, + "step": 9684 + }, + { + "epoch": 3.5751907457543686, + "grad_norm": 0.9137454628944397, + "learning_rate": 1.7756265365814707e-06, + "loss": 0.5914, + "step": 9685 + }, + { + "epoch": 3.575559931085405, + "grad_norm": 0.9394645690917969, + "learning_rate": 1.7751610396502799e-06, + "loss": 0.6145, + "step": 9686 + }, + { + "epoch": 3.5759291164164413, + "grad_norm": 0.9442930221557617, + "learning_rate": 1.7746955701540138e-06, + "loss": 0.6138, + "step": 9687 + }, + { + "epoch": 3.576298301747477, + "grad_norm": 0.919138491153717, + "learning_rate": 1.7742301281102919e-06, + "loss": 0.5785, + "step": 9688 + }, + { + "epoch": 3.5766674870785136, + "grad_norm": 0.9329934120178223, + "learning_rate": 1.7737647135367314e-06, + "loss": 0.5922, + "step": 9689 + }, + { + "epoch": 3.5770366724095495, + "grad_norm": 0.9321364164352417, + "learning_rate": 1.7732993264509463e-06, + "loss": 0.5822, + "step": 9690 + }, + { + "epoch": 3.577405857740586, + "grad_norm": 0.9268581867218018, + "learning_rate": 1.772833966870553e-06, + "loss": 0.585, + "step": 9691 + }, + { + "epoch": 3.5777750430716218, + "grad_norm": 0.9223912358283997, + "learning_rate": 1.772368634813164e-06, + "loss": 0.5907, + "step": 9692 + }, + { + "epoch": 3.578144228402658, + "grad_norm": 0.9474624395370483, + "learning_rate": 1.7719033302963922e-06, + "loss": 0.5937, + "step": 9693 + }, + { + "epoch": 3.5785134137336945, + "grad_norm": 0.9176208972930908, + "learning_rate": 1.7714380533378511e-06, + "loss": 0.5973, + "step": 9694 + }, + { + "epoch": 3.5788825990647304, + "grad_norm": 0.9061436653137207, + "learning_rate": 1.7709728039551482e-06, + "loss": 0.5902, + "step": 9695 + }, + { + "epoch": 3.5792517843957667, + "grad_norm": 0.8964852690696716, + "learning_rate": 1.770507582165896e-06, + "loss": 0.544, + "step": 9696 + }, + { + "epoch": 3.579620969726803, + "grad_norm": 0.932510495185852, + "learning_rate": 1.7700423879876998e-06, + "loss": 0.606, + "step": 9697 + }, + { + "epoch": 3.579990155057839, + "grad_norm": 0.9259088039398193, + "learning_rate": 1.7695772214381697e-06, + "loss": 0.5748, + "step": 9698 + }, + { + "epoch": 3.5803593403888754, + "grad_norm": 0.9126911759376526, + "learning_rate": 1.7691120825349118e-06, + "loss": 0.5665, + "step": 9699 + }, + { + "epoch": 3.5807285257199113, + "grad_norm": 0.9209457039833069, + "learning_rate": 1.7686469712955303e-06, + "loss": 0.5644, + "step": 9700 + }, + { + "epoch": 3.5810977110509477, + "grad_norm": 0.9376311302185059, + "learning_rate": 1.7681818877376305e-06, + "loss": 0.6162, + "step": 9701 + }, + { + "epoch": 3.5814668963819836, + "grad_norm": 0.9159733653068542, + "learning_rate": 1.767716831878815e-06, + "loss": 0.587, + "step": 9702 + }, + { + "epoch": 3.58183608171302, + "grad_norm": 0.9293885827064514, + "learning_rate": 1.767251803736686e-06, + "loss": 0.5833, + "step": 9703 + }, + { + "epoch": 3.5822052670440563, + "grad_norm": 0.9096619486808777, + "learning_rate": 1.766786803328846e-06, + "loss": 0.5848, + "step": 9704 + }, + { + "epoch": 3.582574452375092, + "grad_norm": 1.1036871671676636, + "learning_rate": 1.7663218306728926e-06, + "loss": 0.6123, + "step": 9705 + }, + { + "epoch": 3.5829436377061286, + "grad_norm": 0.905443549156189, + "learning_rate": 1.7658568857864283e-06, + "loss": 0.5722, + "step": 9706 + }, + { + "epoch": 3.583312823037165, + "grad_norm": 0.942611813545227, + "learning_rate": 1.7653919686870472e-06, + "loss": 0.6105, + "step": 9707 + }, + { + "epoch": 3.583682008368201, + "grad_norm": 0.9520750641822815, + "learning_rate": 1.764927079392349e-06, + "loss": 0.5954, + "step": 9708 + }, + { + "epoch": 3.5840511936992367, + "grad_norm": 0.9430752992630005, + "learning_rate": 1.764462217919929e-06, + "loss": 0.5827, + "step": 9709 + }, + { + "epoch": 3.584420379030273, + "grad_norm": 0.9360849857330322, + "learning_rate": 1.763997384287382e-06, + "loss": 0.5982, + "step": 9710 + }, + { + "epoch": 3.5847895643613095, + "grad_norm": 0.9249098896980286, + "learning_rate": 1.763532578512302e-06, + "loss": 0.5638, + "step": 9711 + }, + { + "epoch": 3.5851587496923454, + "grad_norm": 0.9236018657684326, + "learning_rate": 1.763067800612281e-06, + "loss": 0.5899, + "step": 9712 + }, + { + "epoch": 3.5855279350233817, + "grad_norm": 0.9048075675964355, + "learning_rate": 1.7626030506049102e-06, + "loss": 0.5606, + "step": 9713 + }, + { + "epoch": 3.585897120354418, + "grad_norm": 0.9455881714820862, + "learning_rate": 1.762138328507783e-06, + "loss": 0.562, + "step": 9714 + }, + { + "epoch": 3.586266305685454, + "grad_norm": 0.9507434368133545, + "learning_rate": 1.761673634338486e-06, + "loss": 0.6263, + "step": 9715 + }, + { + "epoch": 3.5866354910164904, + "grad_norm": 0.9045443534851074, + "learning_rate": 1.7612089681146087e-06, + "loss": 0.6157, + "step": 9716 + }, + { + "epoch": 3.5870046763475267, + "grad_norm": 0.961624264717102, + "learning_rate": 1.7607443298537403e-06, + "loss": 0.6378, + "step": 9717 + }, + { + "epoch": 3.5873738616785626, + "grad_norm": 0.9364482760429382, + "learning_rate": 1.7602797195734645e-06, + "loss": 0.6075, + "step": 9718 + }, + { + "epoch": 3.5877430470095986, + "grad_norm": 0.9038940668106079, + "learning_rate": 1.7598151372913686e-06, + "loss": 0.5197, + "step": 9719 + }, + { + "epoch": 3.588112232340635, + "grad_norm": 0.8987192511558533, + "learning_rate": 1.7593505830250354e-06, + "loss": 0.5732, + "step": 9720 + }, + { + "epoch": 3.5884814176716713, + "grad_norm": 0.9338714480400085, + "learning_rate": 1.7588860567920491e-06, + "loss": 0.6049, + "step": 9721 + }, + { + "epoch": 3.588850603002707, + "grad_norm": 0.9249098300933838, + "learning_rate": 1.758421558609992e-06, + "loss": 0.5878, + "step": 9722 + }, + { + "epoch": 3.5892197883337436, + "grad_norm": 0.9346422553062439, + "learning_rate": 1.7579570884964442e-06, + "loss": 0.6033, + "step": 9723 + }, + { + "epoch": 3.58958897366478, + "grad_norm": 0.9131539463996887, + "learning_rate": 1.7574926464689872e-06, + "loss": 0.6004, + "step": 9724 + }, + { + "epoch": 3.589958158995816, + "grad_norm": 0.9353798031806946, + "learning_rate": 1.7570282325451985e-06, + "loss": 0.6337, + "step": 9725 + }, + { + "epoch": 3.590327344326852, + "grad_norm": 0.916484534740448, + "learning_rate": 1.7565638467426569e-06, + "loss": 0.5944, + "step": 9726 + }, + { + "epoch": 3.590696529657888, + "grad_norm": 0.9375690221786499, + "learning_rate": 1.7560994890789396e-06, + "loss": 0.5991, + "step": 9727 + }, + { + "epoch": 3.5910657149889245, + "grad_norm": 0.9951784014701843, + "learning_rate": 1.7556351595716212e-06, + "loss": 0.5669, + "step": 9728 + }, + { + "epoch": 3.5914349003199604, + "grad_norm": 0.9353335499763489, + "learning_rate": 1.7551708582382777e-06, + "loss": 0.5854, + "step": 9729 + }, + { + "epoch": 3.5918040856509967, + "grad_norm": 0.9449027180671692, + "learning_rate": 1.7547065850964817e-06, + "loss": 0.6032, + "step": 9730 + }, + { + "epoch": 3.592173270982033, + "grad_norm": 0.9409614205360413, + "learning_rate": 1.754242340163806e-06, + "loss": 0.5756, + "step": 9731 + }, + { + "epoch": 3.592542456313069, + "grad_norm": 0.892133355140686, + "learning_rate": 1.753778123457823e-06, + "loss": 0.5351, + "step": 9732 + }, + { + "epoch": 3.5929116416441054, + "grad_norm": 0.9231703877449036, + "learning_rate": 1.7533139349961014e-06, + "loss": 0.589, + "step": 9733 + }, + { + "epoch": 3.5932808269751417, + "grad_norm": 0.979275643825531, + "learning_rate": 1.7528497747962136e-06, + "loss": 0.5955, + "step": 9734 + }, + { + "epoch": 3.5936500123061776, + "grad_norm": 0.9073494672775269, + "learning_rate": 1.752385642875724e-06, + "loss": 0.5572, + "step": 9735 + }, + { + "epoch": 3.594019197637214, + "grad_norm": 0.9235560894012451, + "learning_rate": 1.7519215392522026e-06, + "loss": 0.579, + "step": 9736 + }, + { + "epoch": 3.59438838296825, + "grad_norm": 0.9511833190917969, + "learning_rate": 1.7514574639432155e-06, + "loss": 0.5884, + "step": 9737 + }, + { + "epoch": 3.5947575682992863, + "grad_norm": 0.9348256587982178, + "learning_rate": 1.7509934169663268e-06, + "loss": 0.6393, + "step": 9738 + }, + { + "epoch": 3.595126753630322, + "grad_norm": 0.9672245979309082, + "learning_rate": 1.7505293983391009e-06, + "loss": 0.6161, + "step": 9739 + }, + { + "epoch": 3.5954959389613586, + "grad_norm": 0.9378015995025635, + "learning_rate": 1.7500654080791005e-06, + "loss": 0.579, + "step": 9740 + }, + { + "epoch": 3.595865124292395, + "grad_norm": 0.9055134057998657, + "learning_rate": 1.7496014462038873e-06, + "loss": 0.5726, + "step": 9741 + }, + { + "epoch": 3.596234309623431, + "grad_norm": 0.9246605038642883, + "learning_rate": 1.7491375127310238e-06, + "loss": 0.5514, + "step": 9742 + }, + { + "epoch": 3.596603494954467, + "grad_norm": 0.9400216937065125, + "learning_rate": 1.748673607678067e-06, + "loss": 0.5999, + "step": 9743 + }, + { + "epoch": 3.5969726802855035, + "grad_norm": 0.9425426125526428, + "learning_rate": 1.7482097310625785e-06, + "loss": 0.59, + "step": 9744 + }, + { + "epoch": 3.5973418656165395, + "grad_norm": 0.9404379725456238, + "learning_rate": 1.747745882902113e-06, + "loss": 0.5846, + "step": 9745 + }, + { + "epoch": 3.597711050947576, + "grad_norm": 0.9785301089286804, + "learning_rate": 1.747282063214229e-06, + "loss": 0.59, + "step": 9746 + }, + { + "epoch": 3.5980802362786117, + "grad_norm": 0.9229869246482849, + "learning_rate": 1.7468182720164817e-06, + "loss": 0.5911, + "step": 9747 + }, + { + "epoch": 3.598449421609648, + "grad_norm": 0.9247424006462097, + "learning_rate": 1.7463545093264244e-06, + "loss": 0.6319, + "step": 9748 + }, + { + "epoch": 3.598818606940684, + "grad_norm": 0.9532724618911743, + "learning_rate": 1.7458907751616117e-06, + "loss": 0.6266, + "step": 9749 + }, + { + "epoch": 3.5991877922717204, + "grad_norm": 0.9145004749298096, + "learning_rate": 1.7454270695395952e-06, + "loss": 0.5857, + "step": 9750 + }, + { + "epoch": 3.5995569776027567, + "grad_norm": 0.8867404460906982, + "learning_rate": 1.7449633924779252e-06, + "loss": 0.5639, + "step": 9751 + }, + { + "epoch": 3.5999261629337926, + "grad_norm": 0.8925729990005493, + "learning_rate": 1.7444997439941535e-06, + "loss": 0.5558, + "step": 9752 + }, + { + "epoch": 3.600295348264829, + "grad_norm": 0.9105004668235779, + "learning_rate": 1.7440361241058267e-06, + "loss": 0.5784, + "step": 9753 + }, + { + "epoch": 3.6006645335958654, + "grad_norm": 0.9038491249084473, + "learning_rate": 1.743572532830495e-06, + "loss": 0.5784, + "step": 9754 + }, + { + "epoch": 3.6010337189269013, + "grad_norm": 0.9115850329399109, + "learning_rate": 1.7431089701857046e-06, + "loss": 0.5916, + "step": 9755 + }, + { + "epoch": 3.6014029042579376, + "grad_norm": 0.9287083148956299, + "learning_rate": 1.7426454361889999e-06, + "loss": 0.6098, + "step": 9756 + }, + { + "epoch": 3.6017720895889735, + "grad_norm": 0.9044436812400818, + "learning_rate": 1.7421819308579272e-06, + "loss": 0.5469, + "step": 9757 + }, + { + "epoch": 3.60214127492001, + "grad_norm": 0.9213511943817139, + "learning_rate": 1.7417184542100284e-06, + "loss": 0.5997, + "step": 9758 + }, + { + "epoch": 3.602510460251046, + "grad_norm": 0.9663923978805542, + "learning_rate": 1.7412550062628475e-06, + "loss": 0.5894, + "step": 9759 + }, + { + "epoch": 3.602879645582082, + "grad_norm": 0.9329471588134766, + "learning_rate": 1.7407915870339252e-06, + "loss": 0.5886, + "step": 9760 + }, + { + "epoch": 3.6032488309131185, + "grad_norm": 0.9388481378555298, + "learning_rate": 1.7403281965408008e-06, + "loss": 0.6002, + "step": 9761 + }, + { + "epoch": 3.6036180162441545, + "grad_norm": 0.9110849499702454, + "learning_rate": 1.7398648348010155e-06, + "loss": 0.5502, + "step": 9762 + }, + { + "epoch": 3.603987201575191, + "grad_norm": 0.9157995581626892, + "learning_rate": 1.7394015018321053e-06, + "loss": 0.5967, + "step": 9763 + }, + { + "epoch": 3.604356386906227, + "grad_norm": 0.941422164440155, + "learning_rate": 1.7389381976516084e-06, + "loss": 0.577, + "step": 9764 + }, + { + "epoch": 3.604725572237263, + "grad_norm": 0.9855178594589233, + "learning_rate": 1.7384749222770613e-06, + "loss": 0.5961, + "step": 9765 + }, + { + "epoch": 3.6050947575682994, + "grad_norm": 0.9058412909507751, + "learning_rate": 1.7380116757259975e-06, + "loss": 0.5865, + "step": 9766 + }, + { + "epoch": 3.6054639428993354, + "grad_norm": 0.922373354434967, + "learning_rate": 1.7375484580159519e-06, + "loss": 0.5777, + "step": 9767 + }, + { + "epoch": 3.6058331282303717, + "grad_norm": 0.9108787775039673, + "learning_rate": 1.7370852691644558e-06, + "loss": 0.6051, + "step": 9768 + }, + { + "epoch": 3.6062023135614076, + "grad_norm": 0.9303464889526367, + "learning_rate": 1.736622109189041e-06, + "loss": 0.5828, + "step": 9769 + }, + { + "epoch": 3.606571498892444, + "grad_norm": 0.9546143412590027, + "learning_rate": 1.7361589781072397e-06, + "loss": 0.5826, + "step": 9770 + }, + { + "epoch": 3.6069406842234804, + "grad_norm": 0.9289159774780273, + "learning_rate": 1.7356958759365795e-06, + "loss": 0.5833, + "step": 9771 + }, + { + "epoch": 3.6073098695545163, + "grad_norm": 0.9508346319198608, + "learning_rate": 1.7352328026945892e-06, + "loss": 0.6215, + "step": 9772 + }, + { + "epoch": 3.6076790548855526, + "grad_norm": 0.9363271594047546, + "learning_rate": 1.7347697583987955e-06, + "loss": 0.5834, + "step": 9773 + }, + { + "epoch": 3.608048240216589, + "grad_norm": 0.9270887970924377, + "learning_rate": 1.7343067430667249e-06, + "loss": 0.5652, + "step": 9774 + }, + { + "epoch": 3.608417425547625, + "grad_norm": 0.9205818176269531, + "learning_rate": 1.7338437567159033e-06, + "loss": 0.607, + "step": 9775 + }, + { + "epoch": 3.608786610878661, + "grad_norm": 0.9414834976196289, + "learning_rate": 1.7333807993638529e-06, + "loss": 0.5955, + "step": 9776 + }, + { + "epoch": 3.609155796209697, + "grad_norm": 0.9223217964172363, + "learning_rate": 1.732917871028098e-06, + "loss": 0.5567, + "step": 9777 + }, + { + "epoch": 3.6095249815407335, + "grad_norm": 0.9270634651184082, + "learning_rate": 1.732454971726159e-06, + "loss": 0.6092, + "step": 9778 + }, + { + "epoch": 3.6098941668717694, + "grad_norm": 0.9339735507965088, + "learning_rate": 1.7319921014755573e-06, + "loss": 0.5948, + "step": 9779 + }, + { + "epoch": 3.610263352202806, + "grad_norm": 0.9566919803619385, + "learning_rate": 1.7315292602938126e-06, + "loss": 0.6042, + "step": 9780 + }, + { + "epoch": 3.610632537533842, + "grad_norm": 0.9111773371696472, + "learning_rate": 1.7310664481984421e-06, + "loss": 0.556, + "step": 9781 + }, + { + "epoch": 3.611001722864878, + "grad_norm": 0.9576960802078247, + "learning_rate": 1.7306036652069647e-06, + "loss": 0.6225, + "step": 9782 + }, + { + "epoch": 3.6113709081959144, + "grad_norm": 0.9075992703437805, + "learning_rate": 1.7301409113368945e-06, + "loss": 0.5964, + "step": 9783 + }, + { + "epoch": 3.611740093526951, + "grad_norm": 0.9260746240615845, + "learning_rate": 1.7296781866057482e-06, + "loss": 0.597, + "step": 9784 + }, + { + "epoch": 3.6121092788579867, + "grad_norm": 0.9320292472839355, + "learning_rate": 1.7292154910310402e-06, + "loss": 0.6132, + "step": 9785 + }, + { + "epoch": 3.6124784641890226, + "grad_norm": 0.9081462621688843, + "learning_rate": 1.7287528246302821e-06, + "loss": 0.5773, + "step": 9786 + }, + { + "epoch": 3.612847649520059, + "grad_norm": 0.9282170534133911, + "learning_rate": 1.7282901874209859e-06, + "loss": 0.5693, + "step": 9787 + }, + { + "epoch": 3.6132168348510953, + "grad_norm": 0.9061475992202759, + "learning_rate": 1.7278275794206634e-06, + "loss": 0.5643, + "step": 9788 + }, + { + "epoch": 3.6135860201821313, + "grad_norm": 0.9109857082366943, + "learning_rate": 1.7273650006468226e-06, + "loss": 0.5784, + "step": 9789 + }, + { + "epoch": 3.6139552055131676, + "grad_norm": 0.8924453258514404, + "learning_rate": 1.7269024511169732e-06, + "loss": 0.5913, + "step": 9790 + }, + { + "epoch": 3.614324390844204, + "grad_norm": 0.9149933457374573, + "learning_rate": 1.7264399308486219e-06, + "loss": 0.5675, + "step": 9791 + }, + { + "epoch": 3.61469357617524, + "grad_norm": 0.9370158910751343, + "learning_rate": 1.7259774398592737e-06, + "loss": 0.6341, + "step": 9792 + }, + { + "epoch": 3.6150627615062763, + "grad_norm": 0.9229863882064819, + "learning_rate": 1.725514978166437e-06, + "loss": 0.5852, + "step": 9793 + }, + { + "epoch": 3.615431946837312, + "grad_norm": 0.91949862241745, + "learning_rate": 1.7250525457876133e-06, + "loss": 0.5702, + "step": 9794 + }, + { + "epoch": 3.6158011321683485, + "grad_norm": 0.9186342358589172, + "learning_rate": 1.7245901427403062e-06, + "loss": 0.5468, + "step": 9795 + }, + { + "epoch": 3.6161703174993844, + "grad_norm": 0.8928649425506592, + "learning_rate": 1.7241277690420173e-06, + "loss": 0.5606, + "step": 9796 + }, + { + "epoch": 3.616539502830421, + "grad_norm": 0.9151570796966553, + "learning_rate": 1.7236654247102473e-06, + "loss": 0.5628, + "step": 9797 + }, + { + "epoch": 3.616908688161457, + "grad_norm": 0.9089704751968384, + "learning_rate": 1.7232031097624968e-06, + "loss": 0.5886, + "step": 9798 + }, + { + "epoch": 3.617277873492493, + "grad_norm": 0.9464991092681885, + "learning_rate": 1.7227408242162627e-06, + "loss": 0.5683, + "step": 9799 + }, + { + "epoch": 3.6176470588235294, + "grad_norm": 1.373986005783081, + "learning_rate": 1.7222785680890437e-06, + "loss": 0.5671, + "step": 9800 + }, + { + "epoch": 3.618016244154566, + "grad_norm": 0.9021572470664978, + "learning_rate": 1.7218163413983347e-06, + "loss": 0.5861, + "step": 9801 + }, + { + "epoch": 3.6183854294856017, + "grad_norm": 0.9487820267677307, + "learning_rate": 1.7213541441616313e-06, + "loss": 0.6076, + "step": 9802 + }, + { + "epoch": 3.618754614816638, + "grad_norm": 0.9271303415298462, + "learning_rate": 1.7208919763964288e-06, + "loss": 0.6087, + "step": 9803 + }, + { + "epoch": 3.619123800147674, + "grad_norm": 0.9155189394950867, + "learning_rate": 1.7204298381202184e-06, + "loss": 0.5959, + "step": 9804 + }, + { + "epoch": 3.6194929854787103, + "grad_norm": 0.9165118932723999, + "learning_rate": 1.7199677293504935e-06, + "loss": 0.5778, + "step": 9805 + }, + { + "epoch": 3.6198621708097463, + "grad_norm": 0.9162717461585999, + "learning_rate": 1.719505650104743e-06, + "loss": 0.553, + "step": 9806 + }, + { + "epoch": 3.6202313561407826, + "grad_norm": 0.9411643147468567, + "learning_rate": 1.7190436004004574e-06, + "loss": 0.6235, + "step": 9807 + }, + { + "epoch": 3.620600541471819, + "grad_norm": 0.9561032056808472, + "learning_rate": 1.7185815802551255e-06, + "loss": 0.6243, + "step": 9808 + }, + { + "epoch": 3.620969726802855, + "grad_norm": 0.9421554207801819, + "learning_rate": 1.7181195896862338e-06, + "loss": 0.5934, + "step": 9809 + }, + { + "epoch": 3.6213389121338913, + "grad_norm": 0.9577184915542603, + "learning_rate": 1.7176576287112696e-06, + "loss": 0.6204, + "step": 9810 + }, + { + "epoch": 3.6217080974649276, + "grad_norm": 0.9139274954795837, + "learning_rate": 1.7171956973477161e-06, + "loss": 0.5619, + "step": 9811 + }, + { + "epoch": 3.6220772827959635, + "grad_norm": 0.9113906621932983, + "learning_rate": 1.716733795613058e-06, + "loss": 0.5786, + "step": 9812 + }, + { + "epoch": 3.622446468127, + "grad_norm": 0.9374806880950928, + "learning_rate": 1.71627192352478e-06, + "loss": 0.5771, + "step": 9813 + }, + { + "epoch": 3.622815653458036, + "grad_norm": 0.9133288860321045, + "learning_rate": 1.7158100811003617e-06, + "loss": 0.554, + "step": 9814 + }, + { + "epoch": 3.623184838789072, + "grad_norm": 0.9445613026618958, + "learning_rate": 1.7153482683572847e-06, + "loss": 0.5889, + "step": 9815 + }, + { + "epoch": 3.623554024120108, + "grad_norm": 0.9233704805374146, + "learning_rate": 1.7148864853130277e-06, + "loss": 0.5694, + "step": 9816 + }, + { + "epoch": 3.6239232094511444, + "grad_norm": 0.9494632482528687, + "learning_rate": 1.7144247319850693e-06, + "loss": 0.5968, + "step": 9817 + }, + { + "epoch": 3.624292394782181, + "grad_norm": 1.0574979782104492, + "learning_rate": 1.7139630083908876e-06, + "loss": 0.5834, + "step": 9818 + }, + { + "epoch": 3.6246615801132167, + "grad_norm": 0.9911521673202515, + "learning_rate": 1.7135013145479574e-06, + "loss": 0.6174, + "step": 9819 + }, + { + "epoch": 3.625030765444253, + "grad_norm": 0.9288178086280823, + "learning_rate": 1.7130396504737546e-06, + "loss": 0.5764, + "step": 9820 + }, + { + "epoch": 3.6253999507752894, + "grad_norm": 0.9436420202255249, + "learning_rate": 1.7125780161857522e-06, + "loss": 0.6357, + "step": 9821 + }, + { + "epoch": 3.6257691361063253, + "grad_norm": 0.9469804763793945, + "learning_rate": 1.7121164117014227e-06, + "loss": 0.6144, + "step": 9822 + }, + { + "epoch": 3.6261383214373617, + "grad_norm": 0.9557605981826782, + "learning_rate": 1.7116548370382395e-06, + "loss": 0.5838, + "step": 9823 + }, + { + "epoch": 3.6265075067683976, + "grad_norm": 0.8984341621398926, + "learning_rate": 1.7111932922136715e-06, + "loss": 0.5822, + "step": 9824 + }, + { + "epoch": 3.626876692099434, + "grad_norm": 0.9155614376068115, + "learning_rate": 1.7107317772451887e-06, + "loss": 0.5569, + "step": 9825 + }, + { + "epoch": 3.62724587743047, + "grad_norm": 0.9214277267456055, + "learning_rate": 1.710270292150259e-06, + "loss": 0.5743, + "step": 9826 + }, + { + "epoch": 3.6276150627615062, + "grad_norm": 0.9291713833808899, + "learning_rate": 1.7098088369463495e-06, + "loss": 0.5993, + "step": 9827 + }, + { + "epoch": 3.6279842480925426, + "grad_norm": 0.9368574023246765, + "learning_rate": 1.7093474116509262e-06, + "loss": 0.5897, + "step": 9828 + }, + { + "epoch": 3.6283534334235785, + "grad_norm": 0.9861503839492798, + "learning_rate": 1.7088860162814535e-06, + "loss": 0.5973, + "step": 9829 + }, + { + "epoch": 3.628722618754615, + "grad_norm": 0.9518827199935913, + "learning_rate": 1.7084246508553947e-06, + "loss": 0.6424, + "step": 9830 + }, + { + "epoch": 3.6290918040856512, + "grad_norm": 0.9350191950798035, + "learning_rate": 1.7079633153902147e-06, + "loss": 0.5884, + "step": 9831 + }, + { + "epoch": 3.629460989416687, + "grad_norm": 0.9164958596229553, + "learning_rate": 1.7075020099033718e-06, + "loss": 0.591, + "step": 9832 + }, + { + "epoch": 3.6298301747477235, + "grad_norm": 0.9372456073760986, + "learning_rate": 1.7070407344123287e-06, + "loss": 0.5956, + "step": 9833 + }, + { + "epoch": 3.6301993600787594, + "grad_norm": 0.928970992565155, + "learning_rate": 1.7065794889345433e-06, + "loss": 0.6125, + "step": 9834 + }, + { + "epoch": 3.630568545409796, + "grad_norm": 0.9281225204467773, + "learning_rate": 1.7061182734874734e-06, + "loss": 0.591, + "step": 9835 + }, + { + "epoch": 3.6309377307408317, + "grad_norm": 0.9364939332008362, + "learning_rate": 1.705657088088577e-06, + "loss": 0.5869, + "step": 9836 + }, + { + "epoch": 3.631306916071868, + "grad_norm": 0.9324162602424622, + "learning_rate": 1.7051959327553087e-06, + "loss": 0.5965, + "step": 9837 + }, + { + "epoch": 3.6316761014029044, + "grad_norm": 0.9075167179107666, + "learning_rate": 1.7047348075051238e-06, + "loss": 0.5803, + "step": 9838 + }, + { + "epoch": 3.6320452867339403, + "grad_norm": 0.919216513633728, + "learning_rate": 1.704273712355475e-06, + "loss": 0.5766, + "step": 9839 + }, + { + "epoch": 3.6324144720649767, + "grad_norm": 0.9288250803947449, + "learning_rate": 1.7038126473238145e-06, + "loss": 0.5738, + "step": 9840 + }, + { + "epoch": 3.632783657396013, + "grad_norm": 0.9282596111297607, + "learning_rate": 1.7033516124275957e-06, + "loss": 0.602, + "step": 9841 + }, + { + "epoch": 3.633152842727049, + "grad_norm": 0.9053125977516174, + "learning_rate": 1.7028906076842651e-06, + "loss": 0.5662, + "step": 9842 + }, + { + "epoch": 3.633522028058085, + "grad_norm": 0.9252104163169861, + "learning_rate": 1.702429633111275e-06, + "loss": 0.5502, + "step": 9843 + }, + { + "epoch": 3.6338912133891212, + "grad_norm": 0.9513213038444519, + "learning_rate": 1.7019686887260706e-06, + "loss": 0.6061, + "step": 9844 + }, + { + "epoch": 3.6342603987201576, + "grad_norm": 0.9296239614486694, + "learning_rate": 1.7015077745460995e-06, + "loss": 0.5647, + "step": 9845 + }, + { + "epoch": 3.6346295840511935, + "grad_norm": 0.9201794266700745, + "learning_rate": 1.7010468905888077e-06, + "loss": 0.5669, + "step": 9846 + }, + { + "epoch": 3.63499876938223, + "grad_norm": 0.9203991293907166, + "learning_rate": 1.7005860368716386e-06, + "loss": 0.5835, + "step": 9847 + }, + { + "epoch": 3.6353679547132662, + "grad_norm": 0.9343230724334717, + "learning_rate": 1.7001252134120361e-06, + "loss": 0.6019, + "step": 9848 + }, + { + "epoch": 3.635737140044302, + "grad_norm": 0.9190361499786377, + "learning_rate": 1.6996644202274416e-06, + "loss": 0.5961, + "step": 9849 + }, + { + "epoch": 3.6361063253753385, + "grad_norm": 0.9315681457519531, + "learning_rate": 1.6992036573352948e-06, + "loss": 0.5954, + "step": 9850 + }, + { + "epoch": 3.636475510706375, + "grad_norm": 0.9364153146743774, + "learning_rate": 1.6987429247530394e-06, + "loss": 0.6001, + "step": 9851 + }, + { + "epoch": 3.636844696037411, + "grad_norm": 0.9718535542488098, + "learning_rate": 1.6982822224981094e-06, + "loss": 0.5973, + "step": 9852 + }, + { + "epoch": 3.6372138813684467, + "grad_norm": 0.9567764401435852, + "learning_rate": 1.6978215505879454e-06, + "loss": 0.6206, + "step": 9853 + }, + { + "epoch": 3.637583066699483, + "grad_norm": 0.9461734890937805, + "learning_rate": 1.6973609090399817e-06, + "loss": 0.6004, + "step": 9854 + }, + { + "epoch": 3.6379522520305194, + "grad_norm": 0.924358069896698, + "learning_rate": 1.6969002978716548e-06, + "loss": 0.5781, + "step": 9855 + }, + { + "epoch": 3.6383214373615553, + "grad_norm": 0.9427884817123413, + "learning_rate": 1.6964397171003983e-06, + "loss": 0.6127, + "step": 9856 + }, + { + "epoch": 3.6386906226925917, + "grad_norm": 0.9432247877120972, + "learning_rate": 1.6959791667436443e-06, + "loss": 0.5996, + "step": 9857 + }, + { + "epoch": 3.639059808023628, + "grad_norm": 0.9320195913314819, + "learning_rate": 1.6955186468188259e-06, + "loss": 0.586, + "step": 9858 + }, + { + "epoch": 3.639428993354664, + "grad_norm": 0.9466235041618347, + "learning_rate": 1.6950581573433721e-06, + "loss": 0.6211, + "step": 9859 + }, + { + "epoch": 3.6397981786857003, + "grad_norm": 0.9180346727371216, + "learning_rate": 1.6945976983347126e-06, + "loss": 0.6165, + "step": 9860 + }, + { + "epoch": 3.6401673640167362, + "grad_norm": 0.9351822733879089, + "learning_rate": 1.6941372698102776e-06, + "loss": 0.596, + "step": 9861 + }, + { + "epoch": 3.6405365493477726, + "grad_norm": 0.9234793782234192, + "learning_rate": 1.693676871787491e-06, + "loss": 0.6024, + "step": 9862 + }, + { + "epoch": 3.6409057346788085, + "grad_norm": 0.9182064533233643, + "learning_rate": 1.693216504283781e-06, + "loss": 0.5964, + "step": 9863 + }, + { + "epoch": 3.641274920009845, + "grad_norm": 0.9136356711387634, + "learning_rate": 1.692756167316572e-06, + "loss": 0.5925, + "step": 9864 + }, + { + "epoch": 3.6416441053408812, + "grad_norm": 0.9099173545837402, + "learning_rate": 1.6922958609032868e-06, + "loss": 0.5808, + "step": 9865 + }, + { + "epoch": 3.642013290671917, + "grad_norm": 0.9447537064552307, + "learning_rate": 1.6918355850613488e-06, + "loss": 0.582, + "step": 9866 + }, + { + "epoch": 3.6423824760029535, + "grad_norm": 1.0417275428771973, + "learning_rate": 1.6913753398081783e-06, + "loss": 0.6113, + "step": 9867 + }, + { + "epoch": 3.64275166133399, + "grad_norm": 0.9429894089698792, + "learning_rate": 1.6909151251611958e-06, + "loss": 0.5948, + "step": 9868 + }, + { + "epoch": 3.643120846665026, + "grad_norm": 0.9261227250099182, + "learning_rate": 1.6904549411378219e-06, + "loss": 0.5499, + "step": 9869 + }, + { + "epoch": 3.643490031996062, + "grad_norm": 0.9365057349205017, + "learning_rate": 1.6899947877554712e-06, + "loss": 0.5588, + "step": 9870 + }, + { + "epoch": 3.643859217327098, + "grad_norm": 0.9102270603179932, + "learning_rate": 1.6895346650315634e-06, + "loss": 0.5488, + "step": 9871 + }, + { + "epoch": 3.6442284026581344, + "grad_norm": 0.9312403202056885, + "learning_rate": 1.6890745729835123e-06, + "loss": 0.6072, + "step": 9872 + }, + { + "epoch": 3.6445975879891703, + "grad_norm": 0.92508864402771, + "learning_rate": 1.6886145116287323e-06, + "loss": 0.6094, + "step": 9873 + }, + { + "epoch": 3.6449667733202067, + "grad_norm": 0.9431111812591553, + "learning_rate": 1.688154480984638e-06, + "loss": 0.6231, + "step": 9874 + }, + { + "epoch": 3.645335958651243, + "grad_norm": 0.9256555438041687, + "learning_rate": 1.687694481068639e-06, + "loss": 0.5666, + "step": 9875 + }, + { + "epoch": 3.645705143982279, + "grad_norm": 0.938505232334137, + "learning_rate": 1.6872345118981487e-06, + "loss": 0.5956, + "step": 9876 + }, + { + "epoch": 3.6460743293133153, + "grad_norm": 0.944976806640625, + "learning_rate": 1.6867745734905749e-06, + "loss": 0.5864, + "step": 9877 + }, + { + "epoch": 3.6464435146443517, + "grad_norm": 0.9239941239356995, + "learning_rate": 1.6863146658633267e-06, + "loss": 0.5545, + "step": 9878 + }, + { + "epoch": 3.6468126999753876, + "grad_norm": 0.9337015151977539, + "learning_rate": 1.6858547890338127e-06, + "loss": 0.5809, + "step": 9879 + }, + { + "epoch": 3.647181885306424, + "grad_norm": 0.9159792065620422, + "learning_rate": 1.6853949430194366e-06, + "loss": 0.5569, + "step": 9880 + }, + { + "epoch": 3.64755107063746, + "grad_norm": 0.8854168057441711, + "learning_rate": 1.684935127837606e-06, + "loss": 0.5774, + "step": 9881 + }, + { + "epoch": 3.6479202559684962, + "grad_norm": 0.9230166077613831, + "learning_rate": 1.6844753435057227e-06, + "loss": 0.5486, + "step": 9882 + }, + { + "epoch": 3.648289441299532, + "grad_norm": 1.1997127532958984, + "learning_rate": 1.6840155900411907e-06, + "loss": 0.59, + "step": 9883 + }, + { + "epoch": 3.6486586266305685, + "grad_norm": 0.9195371270179749, + "learning_rate": 1.683555867461411e-06, + "loss": 0.5921, + "step": 9884 + }, + { + "epoch": 3.649027811961605, + "grad_norm": 0.9227511286735535, + "learning_rate": 1.6830961757837842e-06, + "loss": 0.5802, + "step": 9885 + }, + { + "epoch": 3.6493969972926408, + "grad_norm": 0.940788745880127, + "learning_rate": 1.6826365150257096e-06, + "loss": 0.575, + "step": 9886 + }, + { + "epoch": 3.649766182623677, + "grad_norm": 0.9528405070304871, + "learning_rate": 1.6821768852045842e-06, + "loss": 0.6177, + "step": 9887 + }, + { + "epoch": 3.6501353679547135, + "grad_norm": 0.9107630252838135, + "learning_rate": 1.6817172863378052e-06, + "loss": 0.5768, + "step": 9888 + }, + { + "epoch": 3.6505045532857494, + "grad_norm": 0.9182484149932861, + "learning_rate": 1.6812577184427702e-06, + "loss": 0.5946, + "step": 9889 + }, + { + "epoch": 3.6508737386167858, + "grad_norm": 0.9449949860572815, + "learning_rate": 1.6807981815368706e-06, + "loss": 0.5996, + "step": 9890 + }, + { + "epoch": 3.6512429239478217, + "grad_norm": 0.9099729061126709, + "learning_rate": 1.6803386756375022e-06, + "loss": 0.5968, + "step": 9891 + }, + { + "epoch": 3.651612109278858, + "grad_norm": 0.9125037789344788, + "learning_rate": 1.6798792007620562e-06, + "loss": 0.5821, + "step": 9892 + }, + { + "epoch": 3.651981294609894, + "grad_norm": 0.9145419001579285, + "learning_rate": 1.679419756927923e-06, + "loss": 0.5604, + "step": 9893 + }, + { + "epoch": 3.6523504799409303, + "grad_norm": 0.949316143989563, + "learning_rate": 1.678960344152494e-06, + "loss": 0.61, + "step": 9894 + }, + { + "epoch": 3.6527196652719667, + "grad_norm": 0.949395477771759, + "learning_rate": 1.6785009624531563e-06, + "loss": 0.5807, + "step": 9895 + }, + { + "epoch": 3.6530888506030026, + "grad_norm": 0.9245078563690186, + "learning_rate": 1.678041611847298e-06, + "loss": 0.562, + "step": 9896 + }, + { + "epoch": 3.653458035934039, + "grad_norm": 0.9444425702095032, + "learning_rate": 1.6775822923523053e-06, + "loss": 0.6018, + "step": 9897 + }, + { + "epoch": 3.6538272212650753, + "grad_norm": 0.9409300088882446, + "learning_rate": 1.6771230039855623e-06, + "loss": 0.5909, + "step": 9898 + }, + { + "epoch": 3.6541964065961112, + "grad_norm": 0.9130560755729675, + "learning_rate": 1.6766637467644559e-06, + "loss": 0.564, + "step": 9899 + }, + { + "epoch": 3.6545655919271476, + "grad_norm": 0.9381970167160034, + "learning_rate": 1.676204520706365e-06, + "loss": 0.591, + "step": 9900 + }, + { + "epoch": 3.6549347772581835, + "grad_norm": 0.9308100342750549, + "learning_rate": 1.6757453258286734e-06, + "loss": 0.5749, + "step": 9901 + }, + { + "epoch": 3.65530396258922, + "grad_norm": 0.911737322807312, + "learning_rate": 1.6752861621487621e-06, + "loss": 0.6025, + "step": 9902 + }, + { + "epoch": 3.6556731479202558, + "grad_norm": 0.9162423014640808, + "learning_rate": 1.674827029684008e-06, + "loss": 0.572, + "step": 9903 + }, + { + "epoch": 3.656042333251292, + "grad_norm": 0.9139289855957031, + "learning_rate": 1.6743679284517916e-06, + "loss": 0.567, + "step": 9904 + }, + { + "epoch": 3.6564115185823285, + "grad_norm": 0.9229161739349365, + "learning_rate": 1.6739088584694876e-06, + "loss": 0.6035, + "step": 9905 + }, + { + "epoch": 3.6567807039133644, + "grad_norm": 0.973821759223938, + "learning_rate": 1.6734498197544725e-06, + "loss": 0.5862, + "step": 9906 + }, + { + "epoch": 3.6571498892444008, + "grad_norm": 0.9868661165237427, + "learning_rate": 1.6729908123241219e-06, + "loss": 0.593, + "step": 9907 + }, + { + "epoch": 3.657519074575437, + "grad_norm": 0.9324049949645996, + "learning_rate": 1.6725318361958069e-06, + "loss": 0.5696, + "step": 9908 + }, + { + "epoch": 3.657888259906473, + "grad_norm": 0.9075131416320801, + "learning_rate": 1.6720728913869019e-06, + "loss": 0.5544, + "step": 9909 + }, + { + "epoch": 3.658257445237509, + "grad_norm": 0.9358451962471008, + "learning_rate": 1.6716139779147755e-06, + "loss": 0.61, + "step": 9910 + }, + { + "epoch": 3.6586266305685453, + "grad_norm": 0.9426688551902771, + "learning_rate": 1.6711550957967986e-06, + "loss": 0.5624, + "step": 9911 + }, + { + "epoch": 3.6589958158995817, + "grad_norm": 0.9631806015968323, + "learning_rate": 1.6706962450503408e-06, + "loss": 0.5616, + "step": 9912 + }, + { + "epoch": 3.6593650012306176, + "grad_norm": 0.9280183911323547, + "learning_rate": 1.6702374256927677e-06, + "loss": 0.5887, + "step": 9913 + }, + { + "epoch": 3.659734186561654, + "grad_norm": 0.9305524826049805, + "learning_rate": 1.6697786377414465e-06, + "loss": 0.5962, + "step": 9914 + }, + { + "epoch": 3.6601033718926903, + "grad_norm": 0.9321364164352417, + "learning_rate": 1.6693198812137418e-06, + "loss": 0.604, + "step": 9915 + }, + { + "epoch": 3.660472557223726, + "grad_norm": 0.9711712002754211, + "learning_rate": 1.6688611561270163e-06, + "loss": 0.5893, + "step": 9916 + }, + { + "epoch": 3.6608417425547626, + "grad_norm": 0.9144282937049866, + "learning_rate": 1.6684024624986356e-06, + "loss": 0.5505, + "step": 9917 + }, + { + "epoch": 3.661210927885799, + "grad_norm": 0.9518265724182129, + "learning_rate": 1.6679438003459575e-06, + "loss": 0.6062, + "step": 9918 + }, + { + "epoch": 3.661580113216835, + "grad_norm": 0.9428040385246277, + "learning_rate": 1.667485169686346e-06, + "loss": 0.59, + "step": 9919 + }, + { + "epoch": 3.6619492985478708, + "grad_norm": 0.9442830681800842, + "learning_rate": 1.667026570537156e-06, + "loss": 0.6173, + "step": 9920 + }, + { + "epoch": 3.662318483878907, + "grad_norm": 0.9074687361717224, + "learning_rate": 1.6665680029157483e-06, + "loss": 0.5863, + "step": 9921 + }, + { + "epoch": 3.6626876692099435, + "grad_norm": 0.9058710336685181, + "learning_rate": 1.666109466839479e-06, + "loss": 0.5634, + "step": 9922 + }, + { + "epoch": 3.6630568545409794, + "grad_norm": 0.9100863337516785, + "learning_rate": 1.6656509623257026e-06, + "loss": 0.5388, + "step": 9923 + }, + { + "epoch": 3.6634260398720158, + "grad_norm": 0.8779420256614685, + "learning_rate": 1.6651924893917745e-06, + "loss": 0.5646, + "step": 9924 + }, + { + "epoch": 3.663795225203052, + "grad_norm": 0.8852850198745728, + "learning_rate": 1.6647340480550467e-06, + "loss": 0.5243, + "step": 9925 + }, + { + "epoch": 3.664164410534088, + "grad_norm": 0.918687641620636, + "learning_rate": 1.6642756383328713e-06, + "loss": 0.6193, + "step": 9926 + }, + { + "epoch": 3.6645335958651244, + "grad_norm": 0.9462483525276184, + "learning_rate": 1.6638172602426004e-06, + "loss": 0.5965, + "step": 9927 + }, + { + "epoch": 3.6649027811961603, + "grad_norm": 0.9319770336151123, + "learning_rate": 1.663358913801581e-06, + "loss": 0.5925, + "step": 9928 + }, + { + "epoch": 3.6652719665271967, + "grad_norm": 0.930600643157959, + "learning_rate": 1.662900599027164e-06, + "loss": 0.5995, + "step": 9929 + }, + { + "epoch": 3.6656411518582326, + "grad_norm": 0.8908993601799011, + "learning_rate": 1.6624423159366937e-06, + "loss": 0.5725, + "step": 9930 + }, + { + "epoch": 3.666010337189269, + "grad_norm": 0.905830979347229, + "learning_rate": 1.6619840645475178e-06, + "loss": 0.5878, + "step": 9931 + }, + { + "epoch": 3.6663795225203053, + "grad_norm": 0.9133006930351257, + "learning_rate": 1.6615258448769813e-06, + "loss": 0.5834, + "step": 9932 + }, + { + "epoch": 3.666748707851341, + "grad_norm": 0.9671419262886047, + "learning_rate": 1.6610676569424265e-06, + "loss": 0.609, + "step": 9933 + }, + { + "epoch": 3.6671178931823776, + "grad_norm": 0.9491820335388184, + "learning_rate": 1.6606095007611952e-06, + "loss": 0.6057, + "step": 9934 + }, + { + "epoch": 3.667487078513414, + "grad_norm": 0.9056117534637451, + "learning_rate": 1.6601513763506314e-06, + "loss": 0.5627, + "step": 9935 + }, + { + "epoch": 3.66785626384445, + "grad_norm": 0.9183022379875183, + "learning_rate": 1.659693283728071e-06, + "loss": 0.6192, + "step": 9936 + }, + { + "epoch": 3.668225449175486, + "grad_norm": 0.9270435571670532, + "learning_rate": 1.6592352229108567e-06, + "loss": 0.5621, + "step": 9937 + }, + { + "epoch": 3.668594634506522, + "grad_norm": 0.9450932741165161, + "learning_rate": 1.6587771939163216e-06, + "loss": 0.5899, + "step": 9938 + }, + { + "epoch": 3.6689638198375585, + "grad_norm": 0.9675694704055786, + "learning_rate": 1.6583191967618052e-06, + "loss": 0.6152, + "step": 9939 + }, + { + "epoch": 3.6693330051685944, + "grad_norm": 0.9284085631370544, + "learning_rate": 1.657861231464642e-06, + "loss": 0.5563, + "step": 9940 + }, + { + "epoch": 3.6697021904996308, + "grad_norm": 0.9667186141014099, + "learning_rate": 1.6574032980421648e-06, + "loss": 0.5588, + "step": 9941 + }, + { + "epoch": 3.670071375830667, + "grad_norm": 0.9100980162620544, + "learning_rate": 1.6569453965117071e-06, + "loss": 0.5836, + "step": 9942 + }, + { + "epoch": 3.670440561161703, + "grad_norm": 0.9022422432899475, + "learning_rate": 1.6564875268906e-06, + "loss": 0.5643, + "step": 9943 + }, + { + "epoch": 3.6708097464927394, + "grad_norm": 0.9598621726036072, + "learning_rate": 1.656029689196173e-06, + "loss": 0.6043, + "step": 9944 + }, + { + "epoch": 3.6711789318237757, + "grad_norm": 0.944935142993927, + "learning_rate": 1.655571883445758e-06, + "loss": 0.5785, + "step": 9945 + }, + { + "epoch": 3.6715481171548117, + "grad_norm": 1.00563645362854, + "learning_rate": 1.6551141096566787e-06, + "loss": 0.5884, + "step": 9946 + }, + { + "epoch": 3.671917302485848, + "grad_norm": 0.9193044900894165, + "learning_rate": 1.6546563678462651e-06, + "loss": 0.5666, + "step": 9947 + }, + { + "epoch": 3.672286487816884, + "grad_norm": 0.9430197477340698, + "learning_rate": 1.6541986580318397e-06, + "loss": 0.6204, + "step": 9948 + }, + { + "epoch": 3.6726556731479203, + "grad_norm": 0.9502775073051453, + "learning_rate": 1.653740980230729e-06, + "loss": 0.5976, + "step": 9949 + }, + { + "epoch": 3.673024858478956, + "grad_norm": 0.9459640979766846, + "learning_rate": 1.6532833344602555e-06, + "loss": 0.5629, + "step": 9950 + }, + { + "epoch": 3.6733940438099926, + "grad_norm": 0.9391629099845886, + "learning_rate": 1.6528257207377399e-06, + "loss": 0.5771, + "step": 9951 + }, + { + "epoch": 3.673763229141029, + "grad_norm": 0.9326270222663879, + "learning_rate": 1.6523681390805041e-06, + "loss": 0.6198, + "step": 9952 + }, + { + "epoch": 3.674132414472065, + "grad_norm": 0.9230613708496094, + "learning_rate": 1.651910589505866e-06, + "loss": 0.5782, + "step": 9953 + }, + { + "epoch": 3.674501599803101, + "grad_norm": 0.9668764472007751, + "learning_rate": 1.651453072031144e-06, + "loss": 0.6147, + "step": 9954 + }, + { + "epoch": 3.6748707851341376, + "grad_norm": 0.9415921568870544, + "learning_rate": 1.6509955866736568e-06, + "loss": 0.5981, + "step": 9955 + }, + { + "epoch": 3.6752399704651735, + "grad_norm": 0.9240815043449402, + "learning_rate": 1.6505381334507175e-06, + "loss": 0.5737, + "step": 9956 + }, + { + "epoch": 3.67560915579621, + "grad_norm": 0.9100967645645142, + "learning_rate": 1.6500807123796431e-06, + "loss": 0.5625, + "step": 9957 + }, + { + "epoch": 3.6759783411272458, + "grad_norm": 0.9186044931411743, + "learning_rate": 1.6496233234777436e-06, + "loss": 0.6057, + "step": 9958 + }, + { + "epoch": 3.676347526458282, + "grad_norm": 0.9505192637443542, + "learning_rate": 1.649165966762334e-06, + "loss": 0.6143, + "step": 9959 + }, + { + "epoch": 3.676716711789318, + "grad_norm": 0.9137967824935913, + "learning_rate": 1.6487086422507243e-06, + "loss": 0.5274, + "step": 9960 + }, + { + "epoch": 3.6770858971203544, + "grad_norm": 0.9055778384208679, + "learning_rate": 1.6482513499602234e-06, + "loss": 0.5827, + "step": 9961 + }, + { + "epoch": 3.6774550824513907, + "grad_norm": 0.963045597076416, + "learning_rate": 1.6477940899081406e-06, + "loss": 0.5973, + "step": 9962 + }, + { + "epoch": 3.6778242677824267, + "grad_norm": 0.927352786064148, + "learning_rate": 1.6473368621117819e-06, + "loss": 0.5761, + "step": 9963 + }, + { + "epoch": 3.678193453113463, + "grad_norm": 0.9336215257644653, + "learning_rate": 1.6468796665884535e-06, + "loss": 0.5897, + "step": 9964 + }, + { + "epoch": 3.6785626384444994, + "grad_norm": 0.9357629418373108, + "learning_rate": 1.646422503355462e-06, + "loss": 0.5893, + "step": 9965 + }, + { + "epoch": 3.6789318237755353, + "grad_norm": 0.9391396045684814, + "learning_rate": 1.6459653724301078e-06, + "loss": 0.5827, + "step": 9966 + }, + { + "epoch": 3.6793010091065717, + "grad_norm": 0.9395307302474976, + "learning_rate": 1.6455082738296963e-06, + "loss": 0.5957, + "step": 9967 + }, + { + "epoch": 3.6796701944376076, + "grad_norm": 0.9205470085144043, + "learning_rate": 1.6450512075715252e-06, + "loss": 0.6008, + "step": 9968 + }, + { + "epoch": 3.680039379768644, + "grad_norm": 0.9051299691200256, + "learning_rate": 1.6445941736728966e-06, + "loss": 0.5581, + "step": 9969 + }, + { + "epoch": 3.68040856509968, + "grad_norm": 0.9095370769500732, + "learning_rate": 1.6441371721511092e-06, + "loss": 0.521, + "step": 9970 + }, + { + "epoch": 3.680777750430716, + "grad_norm": 0.9224794507026672, + "learning_rate": 1.6436802030234593e-06, + "loss": 0.5725, + "step": 9971 + }, + { + "epoch": 3.6811469357617526, + "grad_norm": 0.9270541667938232, + "learning_rate": 1.6432232663072433e-06, + "loss": 0.601, + "step": 9972 + }, + { + "epoch": 3.6815161210927885, + "grad_norm": 0.9387547373771667, + "learning_rate": 1.6427663620197567e-06, + "loss": 0.6266, + "step": 9973 + }, + { + "epoch": 3.681885306423825, + "grad_norm": 0.9289405345916748, + "learning_rate": 1.642309490178292e-06, + "loss": 0.5752, + "step": 9974 + }, + { + "epoch": 3.682254491754861, + "grad_norm": 0.9211804270744324, + "learning_rate": 1.6418526508001436e-06, + "loss": 0.6045, + "step": 9975 + }, + { + "epoch": 3.682623677085897, + "grad_norm": 0.9249197840690613, + "learning_rate": 1.6413958439026001e-06, + "loss": 0.5797, + "step": 9976 + }, + { + "epoch": 3.6829928624169335, + "grad_norm": 0.946011483669281, + "learning_rate": 1.6409390695029538e-06, + "loss": 0.6226, + "step": 9977 + }, + { + "epoch": 3.6833620477479694, + "grad_norm": 0.9219370484352112, + "learning_rate": 1.6404823276184928e-06, + "loss": 0.5718, + "step": 9978 + }, + { + "epoch": 3.6837312330790057, + "grad_norm": 0.9232885837554932, + "learning_rate": 1.6400256182665038e-06, + "loss": 0.5852, + "step": 9979 + }, + { + "epoch": 3.6841004184100417, + "grad_norm": 0.9279897212982178, + "learning_rate": 1.6395689414642747e-06, + "loss": 0.5828, + "step": 9980 + }, + { + "epoch": 3.684469603741078, + "grad_norm": 0.898628830909729, + "learning_rate": 1.6391122972290885e-06, + "loss": 0.602, + "step": 9981 + }, + { + "epoch": 3.6848387890721144, + "grad_norm": 0.9124892354011536, + "learning_rate": 1.6386556855782304e-06, + "loss": 0.5672, + "step": 9982 + }, + { + "epoch": 3.6852079744031503, + "grad_norm": 0.9479355216026306, + "learning_rate": 1.6381991065289835e-06, + "loss": 0.6219, + "step": 9983 + }, + { + "epoch": 3.6855771597341866, + "grad_norm": 0.9266003370285034, + "learning_rate": 1.637742560098627e-06, + "loss": 0.5772, + "step": 9984 + }, + { + "epoch": 3.685946345065223, + "grad_norm": 0.9095280170440674, + "learning_rate": 1.6372860463044444e-06, + "loss": 0.5921, + "step": 9985 + }, + { + "epoch": 3.686315530396259, + "grad_norm": 0.9173744916915894, + "learning_rate": 1.636829565163711e-06, + "loss": 0.5743, + "step": 9986 + }, + { + "epoch": 3.686684715727295, + "grad_norm": 0.9175346493721008, + "learning_rate": 1.6363731166937067e-06, + "loss": 0.6431, + "step": 9987 + }, + { + "epoch": 3.687053901058331, + "grad_norm": 0.9140634536743164, + "learning_rate": 1.6359167009117078e-06, + "loss": 0.6108, + "step": 9988 + }, + { + "epoch": 3.6874230863893676, + "grad_norm": 0.8954952955245972, + "learning_rate": 1.6354603178349883e-06, + "loss": 0.5789, + "step": 9989 + }, + { + "epoch": 3.6877922717204035, + "grad_norm": 0.9598110318183899, + "learning_rate": 1.6350039674808239e-06, + "loss": 0.6198, + "step": 9990 + }, + { + "epoch": 3.68816145705144, + "grad_norm": 0.9097958207130432, + "learning_rate": 1.6345476498664852e-06, + "loss": 0.5779, + "step": 9991 + }, + { + "epoch": 3.688530642382476, + "grad_norm": 0.9148048162460327, + "learning_rate": 1.6340913650092454e-06, + "loss": 0.5677, + "step": 9992 + }, + { + "epoch": 3.688899827713512, + "grad_norm": 0.9531578421592712, + "learning_rate": 1.6336351129263745e-06, + "loss": 0.5845, + "step": 9993 + }, + { + "epoch": 3.6892690130445485, + "grad_norm": 0.9169396758079529, + "learning_rate": 1.6331788936351401e-06, + "loss": 0.5935, + "step": 9994 + }, + { + "epoch": 3.6896381983755844, + "grad_norm": 0.9280105233192444, + "learning_rate": 1.6327227071528125e-06, + "loss": 0.5802, + "step": 9995 + }, + { + "epoch": 3.6900073837066207, + "grad_norm": 0.9117137789726257, + "learning_rate": 1.6322665534966554e-06, + "loss": 0.5566, + "step": 9996 + }, + { + "epoch": 3.6903765690376567, + "grad_norm": 0.8951106667518616, + "learning_rate": 1.6318104326839356e-06, + "loss": 0.5592, + "step": 9997 + }, + { + "epoch": 3.690745754368693, + "grad_norm": 0.9385154843330383, + "learning_rate": 1.6313543447319175e-06, + "loss": 0.5815, + "step": 9998 + }, + { + "epoch": 3.6911149396997294, + "grad_norm": 0.9409191012382507, + "learning_rate": 1.630898289657863e-06, + "loss": 0.5812, + "step": 9999 + }, + { + "epoch": 3.6914841250307653, + "grad_norm": 0.9958325028419495, + "learning_rate": 1.630442267479034e-06, + "loss": 0.626, + "step": 10000 + }, + { + "epoch": 3.6918533103618016, + "grad_norm": 0.9482781887054443, + "learning_rate": 1.6299862782126907e-06, + "loss": 0.626, + "step": 10001 + }, + { + "epoch": 3.692222495692838, + "grad_norm": 0.9452361464500427, + "learning_rate": 1.629530321876092e-06, + "loss": 0.6173, + "step": 10002 + }, + { + "epoch": 3.692591681023874, + "grad_norm": 0.9093999862670898, + "learning_rate": 1.6290743984864964e-06, + "loss": 0.6084, + "step": 10003 + }, + { + "epoch": 3.6929608663549103, + "grad_norm": 0.946125864982605, + "learning_rate": 1.628618508061159e-06, + "loss": 0.5939, + "step": 10004 + }, + { + "epoch": 3.693330051685946, + "grad_norm": 0.9222347736358643, + "learning_rate": 1.6281626506173375e-06, + "loss": 0.569, + "step": 10005 + }, + { + "epoch": 3.6936992370169826, + "grad_norm": 0.9147573113441467, + "learning_rate": 1.6277068261722828e-06, + "loss": 0.5906, + "step": 10006 + }, + { + "epoch": 3.6940684223480185, + "grad_norm": 0.9287211894989014, + "learning_rate": 1.6272510347432501e-06, + "loss": 0.6008, + "step": 10007 + }, + { + "epoch": 3.694437607679055, + "grad_norm": 0.9258946180343628, + "learning_rate": 1.6267952763474909e-06, + "loss": 0.5967, + "step": 10008 + }, + { + "epoch": 3.694806793010091, + "grad_norm": 0.9078675508499146, + "learning_rate": 1.6263395510022546e-06, + "loss": 0.5723, + "step": 10009 + }, + { + "epoch": 3.695175978341127, + "grad_norm": 0.9086678624153137, + "learning_rate": 1.6258838587247899e-06, + "loss": 0.595, + "step": 10010 + }, + { + "epoch": 3.6955451636721635, + "grad_norm": 0.9331737756729126, + "learning_rate": 1.6254281995323465e-06, + "loss": 0.5742, + "step": 10011 + }, + { + "epoch": 3.6959143490032, + "grad_norm": 0.9111393094062805, + "learning_rate": 1.624972573442169e-06, + "loss": 0.598, + "step": 10012 + }, + { + "epoch": 3.6962835343342357, + "grad_norm": 0.924949049949646, + "learning_rate": 1.624516980471504e-06, + "loss": 0.5824, + "step": 10013 + }, + { + "epoch": 3.696652719665272, + "grad_norm": 0.9353187084197998, + "learning_rate": 1.6240614206375935e-06, + "loss": 0.597, + "step": 10014 + }, + { + "epoch": 3.697021904996308, + "grad_norm": 0.9206618666648865, + "learning_rate": 1.6236058939576827e-06, + "loss": 0.625, + "step": 10015 + }, + { + "epoch": 3.6973910903273444, + "grad_norm": 0.9226539731025696, + "learning_rate": 1.6231504004490126e-06, + "loss": 0.6225, + "step": 10016 + }, + { + "epoch": 3.6977602756583803, + "grad_norm": 0.9050605893135071, + "learning_rate": 1.6226949401288225e-06, + "loss": 0.5728, + "step": 10017 + }, + { + "epoch": 3.6981294609894166, + "grad_norm": 0.9392155408859253, + "learning_rate": 1.6222395130143525e-06, + "loss": 0.5838, + "step": 10018 + }, + { + "epoch": 3.698498646320453, + "grad_norm": 0.9392346143722534, + "learning_rate": 1.6217841191228399e-06, + "loss": 0.6399, + "step": 10019 + }, + { + "epoch": 3.698867831651489, + "grad_norm": 0.917790949344635, + "learning_rate": 1.621328758471521e-06, + "loss": 0.5905, + "step": 10020 + }, + { + "epoch": 3.6992370169825253, + "grad_norm": 0.9263393878936768, + "learning_rate": 1.6208734310776313e-06, + "loss": 0.6222, + "step": 10021 + }, + { + "epoch": 3.6996062023135616, + "grad_norm": 0.9033104777336121, + "learning_rate": 1.620418136958405e-06, + "loss": 0.5802, + "step": 10022 + }, + { + "epoch": 3.6999753876445975, + "grad_norm": 0.904790997505188, + "learning_rate": 1.6199628761310745e-06, + "loss": 0.5882, + "step": 10023 + }, + { + "epoch": 3.700344572975634, + "grad_norm": 0.9341480731964111, + "learning_rate": 1.6195076486128708e-06, + "loss": 0.6183, + "step": 10024 + }, + { + "epoch": 3.70071375830667, + "grad_norm": 0.9141009449958801, + "learning_rate": 1.6190524544210251e-06, + "loss": 0.5433, + "step": 10025 + }, + { + "epoch": 3.701082943637706, + "grad_norm": 0.9352914094924927, + "learning_rate": 1.618597293572766e-06, + "loss": 0.6027, + "step": 10026 + }, + { + "epoch": 3.701452128968742, + "grad_norm": 0.9116902947425842, + "learning_rate": 1.6181421660853213e-06, + "loss": 0.5532, + "step": 10027 + }, + { + "epoch": 3.7018213142997785, + "grad_norm": 0.9406532049179077, + "learning_rate": 1.6176870719759177e-06, + "loss": 0.5866, + "step": 10028 + }, + { + "epoch": 3.702190499630815, + "grad_norm": 0.9074361324310303, + "learning_rate": 1.6172320112617795e-06, + "loss": 0.6052, + "step": 10029 + }, + { + "epoch": 3.7025596849618507, + "grad_norm": 0.9407531023025513, + "learning_rate": 1.6167769839601309e-06, + "loss": 0.5886, + "step": 10030 + }, + { + "epoch": 3.702928870292887, + "grad_norm": 0.9415310621261597, + "learning_rate": 1.6163219900881952e-06, + "loss": 0.6001, + "step": 10031 + }, + { + "epoch": 3.7032980556239234, + "grad_norm": 0.955744743347168, + "learning_rate": 1.615867029663193e-06, + "loss": 0.5879, + "step": 10032 + }, + { + "epoch": 3.7036672409549594, + "grad_norm": 0.9282245635986328, + "learning_rate": 1.6154121027023448e-06, + "loss": 0.5698, + "step": 10033 + }, + { + "epoch": 3.7040364262859957, + "grad_norm": 0.9133482575416565, + "learning_rate": 1.6149572092228684e-06, + "loss": 0.5982, + "step": 10034 + }, + { + "epoch": 3.7044056116170316, + "grad_norm": 0.9466165900230408, + "learning_rate": 1.614502349241983e-06, + "loss": 0.6104, + "step": 10035 + }, + { + "epoch": 3.704774796948068, + "grad_norm": 0.9089131355285645, + "learning_rate": 1.6140475227769047e-06, + "loss": 0.5686, + "step": 10036 + }, + { + "epoch": 3.705143982279104, + "grad_norm": 0.9333856105804443, + "learning_rate": 1.6135927298448475e-06, + "loss": 0.6039, + "step": 10037 + }, + { + "epoch": 3.7055131676101403, + "grad_norm": 0.9207614064216614, + "learning_rate": 1.6131379704630263e-06, + "loss": 0.6012, + "step": 10038 + }, + { + "epoch": 3.7058823529411766, + "grad_norm": 0.9283604025840759, + "learning_rate": 1.6126832446486524e-06, + "loss": 0.5917, + "step": 10039 + }, + { + "epoch": 3.7062515382722125, + "grad_norm": 0.952979326248169, + "learning_rate": 1.6122285524189376e-06, + "loss": 0.5699, + "step": 10040 + }, + { + "epoch": 3.706620723603249, + "grad_norm": 0.9407802820205688, + "learning_rate": 1.6117738937910926e-06, + "loss": 0.6213, + "step": 10041 + }, + { + "epoch": 3.7069899089342853, + "grad_norm": 0.9064960479736328, + "learning_rate": 1.6113192687823249e-06, + "loss": 0.5487, + "step": 10042 + }, + { + "epoch": 3.707359094265321, + "grad_norm": 0.9035694599151611, + "learning_rate": 1.6108646774098428e-06, + "loss": 0.5577, + "step": 10043 + }, + { + "epoch": 3.7077282795963575, + "grad_norm": 0.9685150980949402, + "learning_rate": 1.6104101196908511e-06, + "loss": 0.5872, + "step": 10044 + }, + { + "epoch": 3.7080974649273934, + "grad_norm": 0.9122081995010376, + "learning_rate": 1.609955595642556e-06, + "loss": 0.6003, + "step": 10045 + }, + { + "epoch": 3.70846665025843, + "grad_norm": 0.9514978528022766, + "learning_rate": 1.6095011052821614e-06, + "loss": 0.6285, + "step": 10046 + }, + { + "epoch": 3.7088358355894657, + "grad_norm": 0.9503528475761414, + "learning_rate": 1.609046648626868e-06, + "loss": 0.6315, + "step": 10047 + }, + { + "epoch": 3.709205020920502, + "grad_norm": 0.9107522368431091, + "learning_rate": 1.608592225693878e-06, + "loss": 0.5542, + "step": 10048 + }, + { + "epoch": 3.7095742062515384, + "grad_norm": 0.9436078667640686, + "learning_rate": 1.6081378365003916e-06, + "loss": 0.6066, + "step": 10049 + }, + { + "epoch": 3.7099433915825744, + "grad_norm": 0.9554114937782288, + "learning_rate": 1.607683481063606e-06, + "loss": 0.6014, + "step": 10050 + }, + { + "epoch": 3.7103125769136107, + "grad_norm": 0.9186397790908813, + "learning_rate": 1.6072291594007195e-06, + "loss": 0.6114, + "step": 10051 + }, + { + "epoch": 3.710681762244647, + "grad_norm": 0.9022214412689209, + "learning_rate": 1.6067748715289272e-06, + "loss": 0.5847, + "step": 10052 + }, + { + "epoch": 3.711050947575683, + "grad_norm": 0.9055942893028259, + "learning_rate": 1.6063206174654233e-06, + "loss": 0.5778, + "step": 10053 + }, + { + "epoch": 3.711420132906719, + "grad_norm": 0.921855092048645, + "learning_rate": 1.605866397227403e-06, + "loss": 0.6002, + "step": 10054 + }, + { + "epoch": 3.7117893182377553, + "grad_norm": 0.9383158683776855, + "learning_rate": 1.6054122108320571e-06, + "loss": 0.629, + "step": 10055 + }, + { + "epoch": 3.7121585035687916, + "grad_norm": 0.9320182204246521, + "learning_rate": 1.6049580582965774e-06, + "loss": 0.6116, + "step": 10056 + }, + { + "epoch": 3.7125276888998275, + "grad_norm": 0.9559786319732666, + "learning_rate": 1.6045039396381517e-06, + "loss": 0.62, + "step": 10057 + }, + { + "epoch": 3.712896874230864, + "grad_norm": 0.915987491607666, + "learning_rate": 1.6040498548739695e-06, + "loss": 0.655, + "step": 10058 + }, + { + "epoch": 3.7132660595619003, + "grad_norm": 0.9306105375289917, + "learning_rate": 1.603595804021218e-06, + "loss": 0.5782, + "step": 10059 + }, + { + "epoch": 3.713635244892936, + "grad_norm": 0.9567102789878845, + "learning_rate": 1.6031417870970816e-06, + "loss": 0.619, + "step": 10060 + }, + { + "epoch": 3.7140044302239725, + "grad_norm": 0.921573281288147, + "learning_rate": 1.6026878041187465e-06, + "loss": 0.5586, + "step": 10061 + }, + { + "epoch": 3.714373615555009, + "grad_norm": 0.9446501731872559, + "learning_rate": 1.6022338551033941e-06, + "loss": 0.5684, + "step": 10062 + }, + { + "epoch": 3.714742800886045, + "grad_norm": 0.9159225821495056, + "learning_rate": 1.6017799400682066e-06, + "loss": 0.5851, + "step": 10063 + }, + { + "epoch": 3.7151119862170807, + "grad_norm": 0.9295268654823303, + "learning_rate": 1.601326059030366e-06, + "loss": 0.5657, + "step": 10064 + }, + { + "epoch": 3.715481171548117, + "grad_norm": 0.9611098170280457, + "learning_rate": 1.6008722120070496e-06, + "loss": 0.6149, + "step": 10065 + }, + { + "epoch": 3.7158503568791534, + "grad_norm": 0.9593212008476257, + "learning_rate": 1.6004183990154369e-06, + "loss": 0.5545, + "step": 10066 + }, + { + "epoch": 3.7162195422101894, + "grad_norm": 0.9260545969009399, + "learning_rate": 1.599964620072703e-06, + "loss": 0.5891, + "step": 10067 + }, + { + "epoch": 3.7165887275412257, + "grad_norm": 0.9524555802345276, + "learning_rate": 1.5995108751960249e-06, + "loss": 0.5761, + "step": 10068 + }, + { + "epoch": 3.716957912872262, + "grad_norm": 0.9053049683570862, + "learning_rate": 1.5990571644025756e-06, + "loss": 0.5646, + "step": 10069 + }, + { + "epoch": 3.717327098203298, + "grad_norm": 0.9110679030418396, + "learning_rate": 1.5986034877095285e-06, + "loss": 0.5965, + "step": 10070 + }, + { + "epoch": 3.7176962835343343, + "grad_norm": 0.9261571764945984, + "learning_rate": 1.598149845134055e-06, + "loss": 0.59, + "step": 10071 + }, + { + "epoch": 3.7180654688653703, + "grad_norm": 0.9241535067558289, + "learning_rate": 1.597696236693325e-06, + "loss": 0.583, + "step": 10072 + }, + { + "epoch": 3.7184346541964066, + "grad_norm": 0.9278937578201294, + "learning_rate": 1.5972426624045068e-06, + "loss": 0.5749, + "step": 10073 + }, + { + "epoch": 3.7188038395274425, + "grad_norm": 0.9022297859191895, + "learning_rate": 1.5967891222847704e-06, + "loss": 0.5786, + "step": 10074 + }, + { + "epoch": 3.719173024858479, + "grad_norm": 0.9166123867034912, + "learning_rate": 1.5963356163512797e-06, + "loss": 0.6065, + "step": 10075 + }, + { + "epoch": 3.7195422101895153, + "grad_norm": 0.922938346862793, + "learning_rate": 1.5958821446212013e-06, + "loss": 0.6221, + "step": 10076 + }, + { + "epoch": 3.719911395520551, + "grad_norm": 0.9280552864074707, + "learning_rate": 1.595428707111698e-06, + "loss": 0.5496, + "step": 10077 + }, + { + "epoch": 3.7202805808515875, + "grad_norm": 0.9117466807365417, + "learning_rate": 1.594975303839933e-06, + "loss": 0.5623, + "step": 10078 + }, + { + "epoch": 3.720649766182624, + "grad_norm": 0.9092711210250854, + "learning_rate": 1.5945219348230673e-06, + "loss": 0.5691, + "step": 10079 + }, + { + "epoch": 3.72101895151366, + "grad_norm": 0.9453006386756897, + "learning_rate": 1.59406860007826e-06, + "loss": 0.6097, + "step": 10080 + }, + { + "epoch": 3.721388136844696, + "grad_norm": 0.9390314221382141, + "learning_rate": 1.5936152996226709e-06, + "loss": 0.5661, + "step": 10081 + }, + { + "epoch": 3.721757322175732, + "grad_norm": 0.9710476994514465, + "learning_rate": 1.593162033473456e-06, + "loss": 0.6191, + "step": 10082 + }, + { + "epoch": 3.7221265075067684, + "grad_norm": 0.9354081749916077, + "learning_rate": 1.5927088016477715e-06, + "loss": 0.5926, + "step": 10083 + }, + { + "epoch": 3.7224956928378043, + "grad_norm": 0.8937820792198181, + "learning_rate": 1.5922556041627736e-06, + "loss": 0.5753, + "step": 10084 + }, + { + "epoch": 3.7228648781688407, + "grad_norm": 0.9487779140472412, + "learning_rate": 1.5918024410356141e-06, + "loss": 0.5953, + "step": 10085 + }, + { + "epoch": 3.723234063499877, + "grad_norm": 0.9275710582733154, + "learning_rate": 1.5913493122834457e-06, + "loss": 0.5932, + "step": 10086 + }, + { + "epoch": 3.723603248830913, + "grad_norm": 0.9272720813751221, + "learning_rate": 1.590896217923419e-06, + "loss": 0.5826, + "step": 10087 + }, + { + "epoch": 3.7239724341619493, + "grad_norm": 0.9389364719390869, + "learning_rate": 1.5904431579726837e-06, + "loss": 0.59, + "step": 10088 + }, + { + "epoch": 3.7243416194929857, + "grad_norm": 0.9288550019264221, + "learning_rate": 1.589990132448388e-06, + "loss": 0.5889, + "step": 10089 + }, + { + "epoch": 3.7247108048240216, + "grad_norm": 0.9208179116249084, + "learning_rate": 1.589537141367678e-06, + "loss": 0.6164, + "step": 10090 + }, + { + "epoch": 3.725079990155058, + "grad_norm": 0.9445948004722595, + "learning_rate": 1.5890841847476995e-06, + "loss": 0.5899, + "step": 10091 + }, + { + "epoch": 3.725449175486094, + "grad_norm": 0.931839644908905, + "learning_rate": 1.588631262605598e-06, + "loss": 0.6353, + "step": 10092 + }, + { + "epoch": 3.7258183608171302, + "grad_norm": 0.9208618402481079, + "learning_rate": 1.588178374958515e-06, + "loss": 0.5758, + "step": 10093 + }, + { + "epoch": 3.726187546148166, + "grad_norm": 0.9220613837242126, + "learning_rate": 1.5877255218235934e-06, + "loss": 0.6345, + "step": 10094 + }, + { + "epoch": 3.7265567314792025, + "grad_norm": 0.8939902186393738, + "learning_rate": 1.5872727032179725e-06, + "loss": 0.5712, + "step": 10095 + }, + { + "epoch": 3.726925916810239, + "grad_norm": 0.955752432346344, + "learning_rate": 1.5868199191587915e-06, + "loss": 0.6083, + "step": 10096 + }, + { + "epoch": 3.727295102141275, + "grad_norm": 0.9902231097221375, + "learning_rate": 1.5863671696631889e-06, + "loss": 0.5818, + "step": 10097 + }, + { + "epoch": 3.727664287472311, + "grad_norm": 0.954149067401886, + "learning_rate": 1.5859144547483e-06, + "loss": 0.5968, + "step": 10098 + }, + { + "epoch": 3.7280334728033475, + "grad_norm": 0.9699732661247253, + "learning_rate": 1.585461774431261e-06, + "loss": 0.568, + "step": 10099 + }, + { + "epoch": 3.7284026581343834, + "grad_norm": 0.9397333264350891, + "learning_rate": 1.5850091287292053e-06, + "loss": 0.5993, + "step": 10100 + }, + { + "epoch": 3.72877184346542, + "grad_norm": 0.904443621635437, + "learning_rate": 1.5845565176592641e-06, + "loss": 0.5802, + "step": 10101 + }, + { + "epoch": 3.7291410287964557, + "grad_norm": 0.9191891551017761, + "learning_rate": 1.584103941238571e-06, + "loss": 0.5598, + "step": 10102 + }, + { + "epoch": 3.729510214127492, + "grad_norm": 0.9518676996231079, + "learning_rate": 1.5836513994842542e-06, + "loss": 0.6029, + "step": 10103 + }, + { + "epoch": 3.729879399458528, + "grad_norm": 0.9681392312049866, + "learning_rate": 1.5831988924134433e-06, + "loss": 0.587, + "step": 10104 + }, + { + "epoch": 3.7302485847895643, + "grad_norm": 0.9158324599266052, + "learning_rate": 1.5827464200432643e-06, + "loss": 0.5855, + "step": 10105 + }, + { + "epoch": 3.7306177701206007, + "grad_norm": 0.9716649651527405, + "learning_rate": 1.582293982390844e-06, + "loss": 0.5981, + "step": 10106 + }, + { + "epoch": 3.7309869554516366, + "grad_norm": 0.9130032658576965, + "learning_rate": 1.5818415794733072e-06, + "loss": 0.5685, + "step": 10107 + }, + { + "epoch": 3.731356140782673, + "grad_norm": 0.906913697719574, + "learning_rate": 1.5813892113077764e-06, + "loss": 0.5804, + "step": 10108 + }, + { + "epoch": 3.7317253261137093, + "grad_norm": 0.9158681035041809, + "learning_rate": 1.5809368779113742e-06, + "loss": 0.5693, + "step": 10109 + }, + { + "epoch": 3.7320945114447452, + "grad_norm": 1.0033365488052368, + "learning_rate": 1.5804845793012214e-06, + "loss": 0.6296, + "step": 10110 + }, + { + "epoch": 3.7324636967757816, + "grad_norm": 0.9488070607185364, + "learning_rate": 1.5800323154944357e-06, + "loss": 0.5852, + "step": 10111 + }, + { + "epoch": 3.7328328821068175, + "grad_norm": 0.9282041192054749, + "learning_rate": 1.5795800865081379e-06, + "loss": 0.6104, + "step": 10112 + }, + { + "epoch": 3.733202067437854, + "grad_norm": 0.9698814153671265, + "learning_rate": 1.5791278923594427e-06, + "loss": 0.6194, + "step": 10113 + }, + { + "epoch": 3.73357125276889, + "grad_norm": 0.9219864010810852, + "learning_rate": 1.578675733065467e-06, + "loss": 0.5719, + "step": 10114 + }, + { + "epoch": 3.733940438099926, + "grad_norm": 0.949013352394104, + "learning_rate": 1.5782236086433233e-06, + "loss": 0.5966, + "step": 10115 + }, + { + "epoch": 3.7343096234309625, + "grad_norm": 0.943129301071167, + "learning_rate": 1.577771519110125e-06, + "loss": 0.6046, + "step": 10116 + }, + { + "epoch": 3.7346788087619984, + "grad_norm": 0.9508444666862488, + "learning_rate": 1.5773194644829843e-06, + "loss": 0.6051, + "step": 10117 + }, + { + "epoch": 3.735047994093035, + "grad_norm": 0.9209508895874023, + "learning_rate": 1.57686744477901e-06, + "loss": 0.5598, + "step": 10118 + }, + { + "epoch": 3.735417179424071, + "grad_norm": 0.9311398863792419, + "learning_rate": 1.5764154600153114e-06, + "loss": 0.6017, + "step": 10119 + }, + { + "epoch": 3.735786364755107, + "grad_norm": 0.941648542881012, + "learning_rate": 1.5759635102089973e-06, + "loss": 0.6104, + "step": 10120 + }, + { + "epoch": 3.736155550086143, + "grad_norm": 0.9195429682731628, + "learning_rate": 1.5755115953771715e-06, + "loss": 0.5614, + "step": 10121 + }, + { + "epoch": 3.7365247354171793, + "grad_norm": 0.9195142984390259, + "learning_rate": 1.5750597155369408e-06, + "loss": 0.5809, + "step": 10122 + }, + { + "epoch": 3.7368939207482157, + "grad_norm": 0.9135737419128418, + "learning_rate": 1.5746078707054074e-06, + "loss": 0.5778, + "step": 10123 + }, + { + "epoch": 3.7372631060792516, + "grad_norm": 0.9532299637794495, + "learning_rate": 1.5741560608996743e-06, + "loss": 0.6117, + "step": 10124 + }, + { + "epoch": 3.737632291410288, + "grad_norm": 0.9508892893791199, + "learning_rate": 1.5737042861368424e-06, + "loss": 0.6008, + "step": 10125 + }, + { + "epoch": 3.7380014767413243, + "grad_norm": 0.8821525573730469, + "learning_rate": 1.5732525464340104e-06, + "loss": 0.5471, + "step": 10126 + }, + { + "epoch": 3.7383706620723602, + "grad_norm": 0.9317185878753662, + "learning_rate": 1.5728008418082778e-06, + "loss": 0.5885, + "step": 10127 + }, + { + "epoch": 3.7387398474033966, + "grad_norm": 0.9128114581108093, + "learning_rate": 1.5723491722767397e-06, + "loss": 0.5822, + "step": 10128 + }, + { + "epoch": 3.739109032734433, + "grad_norm": 0.9074743390083313, + "learning_rate": 1.5718975378564926e-06, + "loss": 0.6189, + "step": 10129 + }, + { + "epoch": 3.739478218065469, + "grad_norm": 0.9362993240356445, + "learning_rate": 1.5714459385646325e-06, + "loss": 0.5829, + "step": 10130 + }, + { + "epoch": 3.739847403396505, + "grad_norm": 0.9283797740936279, + "learning_rate": 1.5709943744182486e-06, + "loss": 0.5884, + "step": 10131 + }, + { + "epoch": 3.740216588727541, + "grad_norm": 0.9165966510772705, + "learning_rate": 1.5705428454344355e-06, + "loss": 0.5903, + "step": 10132 + }, + { + "epoch": 3.7405857740585775, + "grad_norm": 0.9078398942947388, + "learning_rate": 1.5700913516302817e-06, + "loss": 0.5841, + "step": 10133 + }, + { + "epoch": 3.7409549593896134, + "grad_norm": 0.9092011451721191, + "learning_rate": 1.5696398930228772e-06, + "loss": 0.5901, + "step": 10134 + }, + { + "epoch": 3.74132414472065, + "grad_norm": 0.9692609310150146, + "learning_rate": 1.5691884696293094e-06, + "loss": 0.5508, + "step": 10135 + }, + { + "epoch": 3.741693330051686, + "grad_norm": 0.9082786440849304, + "learning_rate": 1.5687370814666635e-06, + "loss": 0.6039, + "step": 10136 + }, + { + "epoch": 3.742062515382722, + "grad_norm": 0.926125168800354, + "learning_rate": 1.5682857285520262e-06, + "loss": 0.5814, + "step": 10137 + }, + { + "epoch": 3.7424317007137584, + "grad_norm": 0.9222734570503235, + "learning_rate": 1.567834410902479e-06, + "loss": 0.5844, + "step": 10138 + }, + { + "epoch": 3.7428008860447943, + "grad_norm": 0.9533694982528687, + "learning_rate": 1.5673831285351048e-06, + "loss": 0.582, + "step": 10139 + }, + { + "epoch": 3.7431700713758307, + "grad_norm": 0.9451269507408142, + "learning_rate": 1.5669318814669865e-06, + "loss": 0.6021, + "step": 10140 + }, + { + "epoch": 3.7435392567068666, + "grad_norm": 0.9289273619651794, + "learning_rate": 1.5664806697152008e-06, + "loss": 0.5839, + "step": 10141 + }, + { + "epoch": 3.743908442037903, + "grad_norm": 0.9478182792663574, + "learning_rate": 1.5660294932968275e-06, + "loss": 0.612, + "step": 10142 + }, + { + "epoch": 3.7442776273689393, + "grad_norm": 0.9443728923797607, + "learning_rate": 1.565578352228943e-06, + "loss": 0.5995, + "step": 10143 + }, + { + "epoch": 3.7446468126999752, + "grad_norm": 0.925834059715271, + "learning_rate": 1.5651272465286227e-06, + "loss": 0.6115, + "step": 10144 + }, + { + "epoch": 3.7450159980310116, + "grad_norm": 0.9486260414123535, + "learning_rate": 1.564676176212942e-06, + "loss": 0.6234, + "step": 10145 + }, + { + "epoch": 3.745385183362048, + "grad_norm": 0.9489620923995972, + "learning_rate": 1.5642251412989717e-06, + "loss": 0.5845, + "step": 10146 + }, + { + "epoch": 3.745754368693084, + "grad_norm": 0.9383641481399536, + "learning_rate": 1.5637741418037854e-06, + "loss": 0.6163, + "step": 10147 + }, + { + "epoch": 3.7461235540241202, + "grad_norm": 0.9138391613960266, + "learning_rate": 1.5633231777444518e-06, + "loss": 0.6111, + "step": 10148 + }, + { + "epoch": 3.746492739355156, + "grad_norm": 0.9183059930801392, + "learning_rate": 1.5628722491380398e-06, + "loss": 0.5739, + "step": 10149 + }, + { + "epoch": 3.7468619246861925, + "grad_norm": 0.9191286563873291, + "learning_rate": 1.5624213560016187e-06, + "loss": 0.5835, + "step": 10150 + }, + { + "epoch": 3.7472311100172284, + "grad_norm": 0.9199957847595215, + "learning_rate": 1.5619704983522522e-06, + "loss": 0.5833, + "step": 10151 + }, + { + "epoch": 3.7476002953482648, + "grad_norm": 0.9489766955375671, + "learning_rate": 1.5615196762070072e-06, + "loss": 0.6144, + "step": 10152 + }, + { + "epoch": 3.747969480679301, + "grad_norm": 0.9188354015350342, + "learning_rate": 1.5610688895829457e-06, + "loss": 0.564, + "step": 10153 + }, + { + "epoch": 3.748338666010337, + "grad_norm": 0.9164754748344421, + "learning_rate": 1.5606181384971303e-06, + "loss": 0.5855, + "step": 10154 + }, + { + "epoch": 3.7487078513413734, + "grad_norm": 0.8974734544754028, + "learning_rate": 1.5601674229666225e-06, + "loss": 0.5434, + "step": 10155 + }, + { + "epoch": 3.7490770366724098, + "grad_norm": 0.8953746557235718, + "learning_rate": 1.5597167430084808e-06, + "loss": 0.5723, + "step": 10156 + }, + { + "epoch": 3.7494462220034457, + "grad_norm": 0.9474878907203674, + "learning_rate": 1.5592660986397628e-06, + "loss": 0.6159, + "step": 10157 + }, + { + "epoch": 3.749815407334482, + "grad_norm": 0.931199848651886, + "learning_rate": 1.5588154898775276e-06, + "loss": 0.5745, + "step": 10158 + }, + { + "epoch": 3.750184592665518, + "grad_norm": 0.952867865562439, + "learning_rate": 1.5583649167388276e-06, + "loss": 0.6074, + "step": 10159 + }, + { + "epoch": 3.7505537779965543, + "grad_norm": 0.932223916053772, + "learning_rate": 1.55791437924072e-06, + "loss": 0.6094, + "step": 10160 + }, + { + "epoch": 3.7509229633275902, + "grad_norm": 0.9659913778305054, + "learning_rate": 1.5574638774002543e-06, + "loss": 0.602, + "step": 10161 + }, + { + "epoch": 3.7512921486586266, + "grad_norm": 0.932474672794342, + "learning_rate": 1.5570134112344836e-06, + "loss": 0.6234, + "step": 10162 + }, + { + "epoch": 3.751661333989663, + "grad_norm": 0.924612820148468, + "learning_rate": 1.5565629807604586e-06, + "loss": 0.6074, + "step": 10163 + }, + { + "epoch": 3.752030519320699, + "grad_norm": 0.9344332814216614, + "learning_rate": 1.556112585995226e-06, + "loss": 0.608, + "step": 10164 + }, + { + "epoch": 3.7523997046517352, + "grad_norm": 0.9472365379333496, + "learning_rate": 1.555662226955835e-06, + "loss": 0.5897, + "step": 10165 + }, + { + "epoch": 3.7527688899827716, + "grad_norm": 0.9292339086532593, + "learning_rate": 1.5552119036593298e-06, + "loss": 0.657, + "step": 10166 + }, + { + "epoch": 3.7531380753138075, + "grad_norm": 0.9320264458656311, + "learning_rate": 1.5547616161227558e-06, + "loss": 0.601, + "step": 10167 + }, + { + "epoch": 3.753507260644844, + "grad_norm": 0.9471850395202637, + "learning_rate": 1.5543113643631578e-06, + "loss": 0.5951, + "step": 10168 + }, + { + "epoch": 3.7538764459758798, + "grad_norm": 0.923704981803894, + "learning_rate": 1.5538611483975746e-06, + "loss": 0.579, + "step": 10169 + }, + { + "epoch": 3.754245631306916, + "grad_norm": 0.9205948710441589, + "learning_rate": 1.5534109682430498e-06, + "loss": 0.5815, + "step": 10170 + }, + { + "epoch": 3.754614816637952, + "grad_norm": 0.9614802598953247, + "learning_rate": 1.5529608239166193e-06, + "loss": 0.6366, + "step": 10171 + }, + { + "epoch": 3.7549840019689884, + "grad_norm": 0.9327760338783264, + "learning_rate": 1.5525107154353237e-06, + "loss": 0.5871, + "step": 10172 + }, + { + "epoch": 3.7553531873000248, + "grad_norm": 0.9420748949050903, + "learning_rate": 1.552060642816199e-06, + "loss": 0.6285, + "step": 10173 + }, + { + "epoch": 3.7557223726310607, + "grad_norm": 0.9319185018539429, + "learning_rate": 1.5516106060762787e-06, + "loss": 0.5703, + "step": 10174 + }, + { + "epoch": 3.756091557962097, + "grad_norm": 0.9286710619926453, + "learning_rate": 1.5511606052325986e-06, + "loss": 0.6125, + "step": 10175 + }, + { + "epoch": 3.7564607432931334, + "grad_norm": 0.9180852174758911, + "learning_rate": 1.5507106403021897e-06, + "loss": 0.5853, + "step": 10176 + }, + { + "epoch": 3.7568299286241693, + "grad_norm": 0.9416006803512573, + "learning_rate": 1.5502607113020825e-06, + "loss": 0.6031, + "step": 10177 + }, + { + "epoch": 3.7571991139552057, + "grad_norm": 0.926967978477478, + "learning_rate": 1.5498108182493096e-06, + "loss": 0.5919, + "step": 10178 + }, + { + "epoch": 3.7575682992862416, + "grad_norm": 0.9437499046325684, + "learning_rate": 1.5493609611608957e-06, + "loss": 0.5688, + "step": 10179 + }, + { + "epoch": 3.757937484617278, + "grad_norm": 0.9082320928573608, + "learning_rate": 1.548911140053871e-06, + "loss": 0.5714, + "step": 10180 + }, + { + "epoch": 3.758306669948314, + "grad_norm": 0.9492562413215637, + "learning_rate": 1.5484613549452577e-06, + "loss": 0.6238, + "step": 10181 + }, + { + "epoch": 3.75867585527935, + "grad_norm": 0.9216838479042053, + "learning_rate": 1.5480116058520829e-06, + "loss": 0.5531, + "step": 10182 + }, + { + "epoch": 3.7590450406103866, + "grad_norm": 0.9688727259635925, + "learning_rate": 1.5475618927913683e-06, + "loss": 0.601, + "step": 10183 + }, + { + "epoch": 3.7594142259414225, + "grad_norm": 0.9493721723556519, + "learning_rate": 1.5471122157801348e-06, + "loss": 0.5799, + "step": 10184 + }, + { + "epoch": 3.759783411272459, + "grad_norm": 0.9424644708633423, + "learning_rate": 1.5466625748354042e-06, + "loss": 0.6015, + "step": 10185 + }, + { + "epoch": 3.760152596603495, + "grad_norm": 0.9503704309463501, + "learning_rate": 1.5462129699741934e-06, + "loss": 0.5666, + "step": 10186 + }, + { + "epoch": 3.760521781934531, + "grad_norm": 0.9419312477111816, + "learning_rate": 1.5457634012135203e-06, + "loss": 0.5666, + "step": 10187 + }, + { + "epoch": 3.760890967265567, + "grad_norm": 0.9074030518531799, + "learning_rate": 1.5453138685704027e-06, + "loss": 0.6179, + "step": 10188 + }, + { + "epoch": 3.7612601525966034, + "grad_norm": 0.9452353119850159, + "learning_rate": 1.5448643720618528e-06, + "loss": 0.5812, + "step": 10189 + }, + { + "epoch": 3.7616293379276398, + "grad_norm": 0.9358368515968323, + "learning_rate": 1.5444149117048862e-06, + "loss": 0.5529, + "step": 10190 + }, + { + "epoch": 3.7619985232586757, + "grad_norm": 0.9326780438423157, + "learning_rate": 1.5439654875165122e-06, + "loss": 0.5952, + "step": 10191 + }, + { + "epoch": 3.762367708589712, + "grad_norm": 0.9325920343399048, + "learning_rate": 1.543516099513743e-06, + "loss": 0.6257, + "step": 10192 + }, + { + "epoch": 3.7627368939207484, + "grad_norm": 0.9140171408653259, + "learning_rate": 1.5430667477135883e-06, + "loss": 0.627, + "step": 10193 + }, + { + "epoch": 3.7631060792517843, + "grad_norm": 0.9093880653381348, + "learning_rate": 1.5426174321330545e-06, + "loss": 0.5744, + "step": 10194 + }, + { + "epoch": 3.7634752645828207, + "grad_norm": 0.9350526928901672, + "learning_rate": 1.5421681527891485e-06, + "loss": 0.5843, + "step": 10195 + }, + { + "epoch": 3.763844449913857, + "grad_norm": 0.938617467880249, + "learning_rate": 1.5417189096988768e-06, + "loss": 0.6157, + "step": 10196 + }, + { + "epoch": 3.764213635244893, + "grad_norm": 0.9495716094970703, + "learning_rate": 1.5412697028792407e-06, + "loss": 0.6078, + "step": 10197 + }, + { + "epoch": 3.764582820575929, + "grad_norm": 0.9798267483711243, + "learning_rate": 1.5408205323472453e-06, + "loss": 0.5639, + "step": 10198 + }, + { + "epoch": 3.764952005906965, + "grad_norm": 0.9145382642745972, + "learning_rate": 1.540371398119888e-06, + "loss": 0.568, + "step": 10199 + }, + { + "epoch": 3.7653211912380016, + "grad_norm": 0.9180849194526672, + "learning_rate": 1.5399223002141717e-06, + "loss": 0.5915, + "step": 10200 + }, + { + "epoch": 3.7656903765690375, + "grad_norm": 0.9559164047241211, + "learning_rate": 1.5394732386470935e-06, + "loss": 0.6244, + "step": 10201 + }, + { + "epoch": 3.766059561900074, + "grad_norm": 0.9486687183380127, + "learning_rate": 1.5390242134356492e-06, + "loss": 0.5908, + "step": 10202 + }, + { + "epoch": 3.76642874723111, + "grad_norm": 0.9333081841468811, + "learning_rate": 1.538575224596836e-06, + "loss": 0.5939, + "step": 10203 + }, + { + "epoch": 3.766797932562146, + "grad_norm": 0.9214265942573547, + "learning_rate": 1.5381262721476465e-06, + "loss": 0.5773, + "step": 10204 + }, + { + "epoch": 3.7671671178931825, + "grad_norm": 0.925331175327301, + "learning_rate": 1.5376773561050733e-06, + "loss": 0.6012, + "step": 10205 + }, + { + "epoch": 3.7675363032242184, + "grad_norm": 0.9432594776153564, + "learning_rate": 1.5372284764861106e-06, + "loss": 0.6157, + "step": 10206 + }, + { + "epoch": 3.7679054885552548, + "grad_norm": 0.9492666721343994, + "learning_rate": 1.5367796333077444e-06, + "loss": 0.5951, + "step": 10207 + }, + { + "epoch": 3.7682746738862907, + "grad_norm": 0.9278252720832825, + "learning_rate": 1.5363308265869663e-06, + "loss": 0.577, + "step": 10208 + }, + { + "epoch": 3.768643859217327, + "grad_norm": 1.0623453855514526, + "learning_rate": 1.5358820563407613e-06, + "loss": 0.5729, + "step": 10209 + }, + { + "epoch": 3.7690130445483634, + "grad_norm": 0.9358916878700256, + "learning_rate": 1.5354333225861162e-06, + "loss": 0.5709, + "step": 10210 + }, + { + "epoch": 3.7693822298793993, + "grad_norm": 0.9141185283660889, + "learning_rate": 1.5349846253400163e-06, + "loss": 0.5827, + "step": 10211 + }, + { + "epoch": 3.7697514152104357, + "grad_norm": 0.9232922792434692, + "learning_rate": 1.534535964619443e-06, + "loss": 0.5937, + "step": 10212 + }, + { + "epoch": 3.770120600541472, + "grad_norm": 0.936638355255127, + "learning_rate": 1.5340873404413792e-06, + "loss": 0.5915, + "step": 10213 + }, + { + "epoch": 3.770489785872508, + "grad_norm": 0.922877311706543, + "learning_rate": 1.5336387528228044e-06, + "loss": 0.5831, + "step": 10214 + }, + { + "epoch": 3.7708589712035443, + "grad_norm": 0.9458206295967102, + "learning_rate": 1.533190201780697e-06, + "loss": 0.583, + "step": 10215 + }, + { + "epoch": 3.77122815653458, + "grad_norm": 0.9818943738937378, + "learning_rate": 1.5327416873320372e-06, + "loss": 0.6263, + "step": 10216 + }, + { + "epoch": 3.7715973418656166, + "grad_norm": 1.0535231828689575, + "learning_rate": 1.532293209493798e-06, + "loss": 0.5716, + "step": 10217 + }, + { + "epoch": 3.7719665271966525, + "grad_norm": 0.911591649055481, + "learning_rate": 1.5318447682829563e-06, + "loss": 0.5608, + "step": 10218 + }, + { + "epoch": 3.772335712527689, + "grad_norm": 0.9380531311035156, + "learning_rate": 1.5313963637164837e-06, + "loss": 0.5899, + "step": 10219 + }, + { + "epoch": 3.772704897858725, + "grad_norm": 0.9625257253646851, + "learning_rate": 1.5309479958113527e-06, + "loss": 0.6233, + "step": 10220 + }, + { + "epoch": 3.773074083189761, + "grad_norm": 0.922096312046051, + "learning_rate": 1.5304996645845354e-06, + "loss": 0.5475, + "step": 10221 + }, + { + "epoch": 3.7734432685207975, + "grad_norm": 1.0308096408843994, + "learning_rate": 1.530051370052999e-06, + "loss": 0.5866, + "step": 10222 + }, + { + "epoch": 3.773812453851834, + "grad_norm": 0.9503872394561768, + "learning_rate": 1.529603112233713e-06, + "loss": 0.5695, + "step": 10223 + }, + { + "epoch": 3.7741816391828698, + "grad_norm": 0.9400650262832642, + "learning_rate": 1.529154891143642e-06, + "loss": 0.5996, + "step": 10224 + }, + { + "epoch": 3.774550824513906, + "grad_norm": 0.9157291054725647, + "learning_rate": 1.5287067067997516e-06, + "loss": 0.6129, + "step": 10225 + }, + { + "epoch": 3.774920009844942, + "grad_norm": 0.9504362344741821, + "learning_rate": 1.5282585592190075e-06, + "loss": 0.5976, + "step": 10226 + }, + { + "epoch": 3.7752891951759784, + "grad_norm": 0.9198172688484192, + "learning_rate": 1.5278104484183684e-06, + "loss": 0.5888, + "step": 10227 + }, + { + "epoch": 3.7756583805070143, + "grad_norm": 0.9314798712730408, + "learning_rate": 1.5273623744147987e-06, + "loss": 0.5591, + "step": 10228 + }, + { + "epoch": 3.7760275658380507, + "grad_norm": 0.933719277381897, + "learning_rate": 1.5269143372252546e-06, + "loss": 0.6013, + "step": 10229 + }, + { + "epoch": 3.776396751169087, + "grad_norm": 0.9437204003334045, + "learning_rate": 1.5264663368666962e-06, + "loss": 0.6163, + "step": 10230 + }, + { + "epoch": 3.776765936500123, + "grad_norm": 0.9553722739219666, + "learning_rate": 1.52601837335608e-06, + "loss": 0.5783, + "step": 10231 + }, + { + "epoch": 3.7771351218311593, + "grad_norm": 0.9459050297737122, + "learning_rate": 1.525570446710361e-06, + "loss": 0.6015, + "step": 10232 + }, + { + "epoch": 3.7775043071621957, + "grad_norm": 0.9378347992897034, + "learning_rate": 1.5251225569464927e-06, + "loss": 0.5564, + "step": 10233 + }, + { + "epoch": 3.7778734924932316, + "grad_norm": 0.9543257355690002, + "learning_rate": 1.5246747040814286e-06, + "loss": 0.5397, + "step": 10234 + }, + { + "epoch": 3.778242677824268, + "grad_norm": 0.9142382740974426, + "learning_rate": 1.5242268881321182e-06, + "loss": 0.5648, + "step": 10235 + }, + { + "epoch": 3.778611863155304, + "grad_norm": 0.9672378897666931, + "learning_rate": 1.5237791091155135e-06, + "loss": 0.5995, + "step": 10236 + }, + { + "epoch": 3.77898104848634, + "grad_norm": 0.9535740613937378, + "learning_rate": 1.52333136704856e-06, + "loss": 0.5781, + "step": 10237 + }, + { + "epoch": 3.779350233817376, + "grad_norm": 0.9110767245292664, + "learning_rate": 1.5228836619482061e-06, + "loss": 0.5776, + "step": 10238 + }, + { + "epoch": 3.7797194191484125, + "grad_norm": 0.8953941464424133, + "learning_rate": 1.5224359938313982e-06, + "loss": 0.5511, + "step": 10239 + }, + { + "epoch": 3.780088604479449, + "grad_norm": 0.943831741809845, + "learning_rate": 1.521988362715079e-06, + "loss": 0.6203, + "step": 10240 + }, + { + "epoch": 3.7804577898104847, + "grad_norm": 0.9267343878746033, + "learning_rate": 1.5215407686161915e-06, + "loss": 0.6105, + "step": 10241 + }, + { + "epoch": 3.780826975141521, + "grad_norm": 0.9378159642219543, + "learning_rate": 1.5210932115516772e-06, + "loss": 0.58, + "step": 10242 + }, + { + "epoch": 3.7811961604725575, + "grad_norm": 0.9367536902427673, + "learning_rate": 1.5206456915384755e-06, + "loss": 0.5797, + "step": 10243 + }, + { + "epoch": 3.7815653458035934, + "grad_norm": 0.9198850989341736, + "learning_rate": 1.520198208593526e-06, + "loss": 0.6067, + "step": 10244 + }, + { + "epoch": 3.7819345311346297, + "grad_norm": 0.9194225668907166, + "learning_rate": 1.519750762733764e-06, + "loss": 0.5952, + "step": 10245 + }, + { + "epoch": 3.7823037164656657, + "grad_norm": 0.9239737391471863, + "learning_rate": 1.519303353976128e-06, + "loss": 0.5795, + "step": 10246 + }, + { + "epoch": 3.782672901796702, + "grad_norm": 0.9279195070266724, + "learning_rate": 1.5188559823375487e-06, + "loss": 0.582, + "step": 10247 + }, + { + "epoch": 3.783042087127738, + "grad_norm": 0.9630566239356995, + "learning_rate": 1.5184086478349613e-06, + "loss": 0.5673, + "step": 10248 + }, + { + "epoch": 3.7834112724587743, + "grad_norm": 0.9439097046852112, + "learning_rate": 1.5179613504852975e-06, + "loss": 0.5718, + "step": 10249 + }, + { + "epoch": 3.7837804577898106, + "grad_norm": 0.930411696434021, + "learning_rate": 1.517514090305486e-06, + "loss": 0.6137, + "step": 10250 + }, + { + "epoch": 3.7841496431208466, + "grad_norm": 0.9431036710739136, + "learning_rate": 1.5170668673124567e-06, + "loss": 0.5783, + "step": 10251 + }, + { + "epoch": 3.784518828451883, + "grad_norm": 0.9267950654029846, + "learning_rate": 1.5166196815231357e-06, + "loss": 0.557, + "step": 10252 + }, + { + "epoch": 3.7848880137829193, + "grad_norm": 0.932884931564331, + "learning_rate": 1.5161725329544497e-06, + "loss": 0.5885, + "step": 10253 + }, + { + "epoch": 3.785257199113955, + "grad_norm": 0.9334838390350342, + "learning_rate": 1.515725421623323e-06, + "loss": 0.5772, + "step": 10254 + }, + { + "epoch": 3.785626384444991, + "grad_norm": 0.9536572694778442, + "learning_rate": 1.5152783475466772e-06, + "loss": 0.587, + "step": 10255 + }, + { + "epoch": 3.7859955697760275, + "grad_norm": 0.9225651025772095, + "learning_rate": 1.514831310741437e-06, + "loss": 0.5791, + "step": 10256 + }, + { + "epoch": 3.786364755107064, + "grad_norm": 0.9198220372200012, + "learning_rate": 1.5143843112245194e-06, + "loss": 0.5563, + "step": 10257 + }, + { + "epoch": 3.7867339404380997, + "grad_norm": 0.9129364490509033, + "learning_rate": 1.5139373490128447e-06, + "loss": 0.576, + "step": 10258 + }, + { + "epoch": 3.787103125769136, + "grad_norm": 0.9304227828979492, + "learning_rate": 1.513490424123331e-06, + "loss": 0.581, + "step": 10259 + }, + { + "epoch": 3.7874723111001725, + "grad_norm": 0.9094902873039246, + "learning_rate": 1.5130435365728928e-06, + "loss": 0.5878, + "step": 10260 + }, + { + "epoch": 3.7878414964312084, + "grad_norm": 0.9298146963119507, + "learning_rate": 1.5125966863784456e-06, + "loss": 0.6402, + "step": 10261 + }, + { + "epoch": 3.7882106817622447, + "grad_norm": 0.9130014181137085, + "learning_rate": 1.5121498735569018e-06, + "loss": 0.584, + "step": 10262 + }, + { + "epoch": 3.788579867093281, + "grad_norm": 0.955388605594635, + "learning_rate": 1.5117030981251736e-06, + "loss": 0.618, + "step": 10263 + }, + { + "epoch": 3.788949052424317, + "grad_norm": 0.9436842203140259, + "learning_rate": 1.511256360100171e-06, + "loss": 0.614, + "step": 10264 + }, + { + "epoch": 3.789318237755353, + "grad_norm": 0.9156129360198975, + "learning_rate": 1.510809659498803e-06, + "loss": 0.5755, + "step": 10265 + }, + { + "epoch": 3.7896874230863893, + "grad_norm": 0.950386106967926, + "learning_rate": 1.5103629963379784e-06, + "loss": 0.6326, + "step": 10266 + }, + { + "epoch": 3.7900566084174256, + "grad_norm": 0.9464142322540283, + "learning_rate": 1.5099163706346005e-06, + "loss": 0.5707, + "step": 10267 + }, + { + "epoch": 3.7904257937484616, + "grad_norm": 0.929446816444397, + "learning_rate": 1.5094697824055759e-06, + "loss": 0.6204, + "step": 10268 + }, + { + "epoch": 3.790794979079498, + "grad_norm": 0.9328938722610474, + "learning_rate": 1.5090232316678084e-06, + "loss": 0.5866, + "step": 10269 + }, + { + "epoch": 3.7911641644105343, + "grad_norm": 0.9021822214126587, + "learning_rate": 1.5085767184381978e-06, + "loss": 0.5835, + "step": 10270 + }, + { + "epoch": 3.79153334974157, + "grad_norm": 0.9419955015182495, + "learning_rate": 1.5081302427336458e-06, + "loss": 0.6071, + "step": 10271 + }, + { + "epoch": 3.7919025350726066, + "grad_norm": 0.9217768907546997, + "learning_rate": 1.5076838045710515e-06, + "loss": 0.5551, + "step": 10272 + }, + { + "epoch": 3.7922717204036425, + "grad_norm": 0.9295209050178528, + "learning_rate": 1.5072374039673116e-06, + "loss": 0.5718, + "step": 10273 + }, + { + "epoch": 3.792640905734679, + "grad_norm": 0.9437929391860962, + "learning_rate": 1.506791040939323e-06, + "loss": 0.6267, + "step": 10274 + }, + { + "epoch": 3.7930100910657147, + "grad_norm": 0.9459223747253418, + "learning_rate": 1.506344715503979e-06, + "loss": 0.5686, + "step": 10275 + }, + { + "epoch": 3.793379276396751, + "grad_norm": 0.9642034769058228, + "learning_rate": 1.5058984276781746e-06, + "loss": 0.5848, + "step": 10276 + }, + { + "epoch": 3.7937484617277875, + "grad_norm": 0.9542469382286072, + "learning_rate": 1.5054521774788011e-06, + "loss": 0.6131, + "step": 10277 + }, + { + "epoch": 3.7941176470588234, + "grad_norm": 0.9355142712593079, + "learning_rate": 1.5050059649227488e-06, + "loss": 0.5688, + "step": 10278 + }, + { + "epoch": 3.7944868323898597, + "grad_norm": 0.9260144233703613, + "learning_rate": 1.504559790026907e-06, + "loss": 0.602, + "step": 10279 + }, + { + "epoch": 3.794856017720896, + "grad_norm": 0.9252691864967346, + "learning_rate": 1.5041136528081623e-06, + "loss": 0.5822, + "step": 10280 + }, + { + "epoch": 3.795225203051932, + "grad_norm": 0.9097942113876343, + "learning_rate": 1.5036675532834016e-06, + "loss": 0.5875, + "step": 10281 + }, + { + "epoch": 3.7955943883829684, + "grad_norm": 0.9207466244697571, + "learning_rate": 1.5032214914695104e-06, + "loss": 0.565, + "step": 10282 + }, + { + "epoch": 3.7959635737140043, + "grad_norm": 0.9351236820220947, + "learning_rate": 1.5027754673833702e-06, + "loss": 0.5794, + "step": 10283 + }, + { + "epoch": 3.7963327590450406, + "grad_norm": 0.9183593392372131, + "learning_rate": 1.502329481041865e-06, + "loss": 0.5766, + "step": 10284 + }, + { + "epoch": 3.7967019443760766, + "grad_norm": 0.9602499604225159, + "learning_rate": 1.5018835324618722e-06, + "loss": 0.5967, + "step": 10285 + }, + { + "epoch": 3.797071129707113, + "grad_norm": 0.9486150145530701, + "learning_rate": 1.5014376216602738e-06, + "loss": 0.6011, + "step": 10286 + }, + { + "epoch": 3.7974403150381493, + "grad_norm": 0.9438326358795166, + "learning_rate": 1.5009917486539465e-06, + "loss": 0.5905, + "step": 10287 + }, + { + "epoch": 3.797809500369185, + "grad_norm": 0.9414025545120239, + "learning_rate": 1.5005459134597655e-06, + "loss": 0.5933, + "step": 10288 + }, + { + "epoch": 3.7981786857002215, + "grad_norm": 0.9444487690925598, + "learning_rate": 1.500100116094607e-06, + "loss": 0.5837, + "step": 10289 + }, + { + "epoch": 3.798547871031258, + "grad_norm": 0.9226516485214233, + "learning_rate": 1.4996543565753434e-06, + "loss": 0.5545, + "step": 10290 + }, + { + "epoch": 3.798917056362294, + "grad_norm": 0.9134663939476013, + "learning_rate": 1.4992086349188463e-06, + "loss": 0.5924, + "step": 10291 + }, + { + "epoch": 3.79928624169333, + "grad_norm": 0.9261674880981445, + "learning_rate": 1.4987629511419872e-06, + "loss": 0.5989, + "step": 10292 + }, + { + "epoch": 3.799655427024366, + "grad_norm": 0.9396654963493347, + "learning_rate": 1.4983173052616333e-06, + "loss": 0.582, + "step": 10293 + }, + { + "epoch": 3.8000246123554025, + "grad_norm": 0.9488776326179504, + "learning_rate": 1.4978716972946544e-06, + "loss": 0.6188, + "step": 10294 + }, + { + "epoch": 3.8003937976864384, + "grad_norm": 0.9400230646133423, + "learning_rate": 1.4974261272579144e-06, + "loss": 0.6254, + "step": 10295 + }, + { + "epoch": 3.8007629830174747, + "grad_norm": 0.8973583579063416, + "learning_rate": 1.4969805951682794e-06, + "loss": 0.5452, + "step": 10296 + }, + { + "epoch": 3.801132168348511, + "grad_norm": 0.9412003755569458, + "learning_rate": 1.496535101042613e-06, + "loss": 0.604, + "step": 10297 + }, + { + "epoch": 3.801501353679547, + "grad_norm": 0.9092232584953308, + "learning_rate": 1.4960896448977758e-06, + "loss": 0.5545, + "step": 10298 + }, + { + "epoch": 3.8018705390105834, + "grad_norm": 0.9523541331291199, + "learning_rate": 1.4956442267506288e-06, + "loss": 0.595, + "step": 10299 + }, + { + "epoch": 3.8022397243416197, + "grad_norm": 0.9126023054122925, + "learning_rate": 1.495198846618031e-06, + "loss": 0.606, + "step": 10300 + }, + { + "epoch": 3.8026089096726556, + "grad_norm": 0.939969539642334, + "learning_rate": 1.4947535045168394e-06, + "loss": 0.594, + "step": 10301 + }, + { + "epoch": 3.802978095003692, + "grad_norm": 0.913698136806488, + "learning_rate": 1.494308200463911e-06, + "loss": 0.581, + "step": 10302 + }, + { + "epoch": 3.803347280334728, + "grad_norm": 0.9045127034187317, + "learning_rate": 1.4938629344760986e-06, + "loss": 0.5915, + "step": 10303 + }, + { + "epoch": 3.8037164656657643, + "grad_norm": 0.9080638885498047, + "learning_rate": 1.4934177065702571e-06, + "loss": 0.5698, + "step": 10304 + }, + { + "epoch": 3.8040856509968, + "grad_norm": 0.9231714606285095, + "learning_rate": 1.492972516763238e-06, + "loss": 0.5982, + "step": 10305 + }, + { + "epoch": 3.8044548363278365, + "grad_norm": 0.9184026718139648, + "learning_rate": 1.4925273650718912e-06, + "loss": 0.569, + "step": 10306 + }, + { + "epoch": 3.804824021658873, + "grad_norm": 0.927729070186615, + "learning_rate": 1.4920822515130662e-06, + "loss": 0.5916, + "step": 10307 + }, + { + "epoch": 3.805193206989909, + "grad_norm": 0.9032055139541626, + "learning_rate": 1.491637176103609e-06, + "loss": 0.582, + "step": 10308 + }, + { + "epoch": 3.805562392320945, + "grad_norm": 0.9454832077026367, + "learning_rate": 1.491192138860367e-06, + "loss": 0.6049, + "step": 10309 + }, + { + "epoch": 3.8059315776519815, + "grad_norm": 0.9128819704055786, + "learning_rate": 1.4907471398001844e-06, + "loss": 0.5964, + "step": 10310 + }, + { + "epoch": 3.8063007629830174, + "grad_norm": 0.9434806704521179, + "learning_rate": 1.4903021789399036e-06, + "loss": 0.6172, + "step": 10311 + }, + { + "epoch": 3.806669948314054, + "grad_norm": 0.9285937547683716, + "learning_rate": 1.489857256296367e-06, + "loss": 0.573, + "step": 10312 + }, + { + "epoch": 3.8070391336450897, + "grad_norm": 0.8895650506019592, + "learning_rate": 1.4894123718864134e-06, + "loss": 0.5286, + "step": 10313 + }, + { + "epoch": 3.807408318976126, + "grad_norm": 0.9417546391487122, + "learning_rate": 1.4889675257268831e-06, + "loss": 0.5756, + "step": 10314 + }, + { + "epoch": 3.807777504307162, + "grad_norm": 0.9381334781646729, + "learning_rate": 1.4885227178346137e-06, + "loss": 0.5934, + "step": 10315 + }, + { + "epoch": 3.8081466896381984, + "grad_norm": 0.9281310439109802, + "learning_rate": 1.4880779482264397e-06, + "loss": 0.5501, + "step": 10316 + }, + { + "epoch": 3.8085158749692347, + "grad_norm": 0.9320988059043884, + "learning_rate": 1.4876332169191959e-06, + "loss": 0.5973, + "step": 10317 + }, + { + "epoch": 3.8088850603002706, + "grad_norm": 0.8974357843399048, + "learning_rate": 1.4871885239297151e-06, + "loss": 0.5395, + "step": 10318 + }, + { + "epoch": 3.809254245631307, + "grad_norm": 0.9452093243598938, + "learning_rate": 1.4867438692748293e-06, + "loss": 0.5585, + "step": 10319 + }, + { + "epoch": 3.8096234309623433, + "grad_norm": 0.9627413153648376, + "learning_rate": 1.4862992529713686e-06, + "loss": 0.5713, + "step": 10320 + }, + { + "epoch": 3.8099926162933793, + "grad_norm": 0.9171698689460754, + "learning_rate": 1.4858546750361607e-06, + "loss": 0.6289, + "step": 10321 + }, + { + "epoch": 3.8103618016244156, + "grad_norm": 0.9273706674575806, + "learning_rate": 1.4854101354860337e-06, + "loss": 0.6027, + "step": 10322 + }, + { + "epoch": 3.8107309869554515, + "grad_norm": 0.9314715266227722, + "learning_rate": 1.4849656343378122e-06, + "loss": 0.6044, + "step": 10323 + }, + { + "epoch": 3.811100172286488, + "grad_norm": 0.9644017815589905, + "learning_rate": 1.484521171608321e-06, + "loss": 0.5845, + "step": 10324 + }, + { + "epoch": 3.811469357617524, + "grad_norm": 0.9341748356819153, + "learning_rate": 1.484076747314384e-06, + "loss": 0.6309, + "step": 10325 + }, + { + "epoch": 3.81183854294856, + "grad_norm": 0.9379871487617493, + "learning_rate": 1.4836323614728206e-06, + "loss": 0.6006, + "step": 10326 + }, + { + "epoch": 3.8122077282795965, + "grad_norm": 0.938424825668335, + "learning_rate": 1.4831880141004524e-06, + "loss": 0.6062, + "step": 10327 + }, + { + "epoch": 3.8125769136106324, + "grad_norm": 0.9297972917556763, + "learning_rate": 1.4827437052140963e-06, + "loss": 0.6235, + "step": 10328 + }, + { + "epoch": 3.812946098941669, + "grad_norm": 0.9505990147590637, + "learning_rate": 1.4822994348305696e-06, + "loss": 0.5945, + "step": 10329 + }, + { + "epoch": 3.813315284272705, + "grad_norm": 0.9140305519104004, + "learning_rate": 1.4818552029666891e-06, + "loss": 0.6049, + "step": 10330 + }, + { + "epoch": 3.813684469603741, + "grad_norm": 0.9357538223266602, + "learning_rate": 1.481411009639267e-06, + "loss": 0.5868, + "step": 10331 + }, + { + "epoch": 3.814053654934777, + "grad_norm": 0.9512758255004883, + "learning_rate": 1.4809668548651174e-06, + "loss": 0.6147, + "step": 10332 + }, + { + "epoch": 3.8144228402658134, + "grad_norm": 0.9186844229698181, + "learning_rate": 1.4805227386610499e-06, + "loss": 0.5948, + "step": 10333 + }, + { + "epoch": 3.8147920255968497, + "grad_norm": 0.941489040851593, + "learning_rate": 1.480078661043875e-06, + "loss": 0.5989, + "step": 10334 + }, + { + "epoch": 3.8151612109278856, + "grad_norm": 0.9632973670959473, + "learning_rate": 1.479634622030402e-06, + "loss": 0.5967, + "step": 10335 + }, + { + "epoch": 3.815530396258922, + "grad_norm": 0.9670729637145996, + "learning_rate": 1.4791906216374354e-06, + "loss": 0.6115, + "step": 10336 + }, + { + "epoch": 3.8158995815899583, + "grad_norm": 0.9216572642326355, + "learning_rate": 1.4787466598817823e-06, + "loss": 0.6186, + "step": 10337 + }, + { + "epoch": 3.8162687669209943, + "grad_norm": 0.947258472442627, + "learning_rate": 1.4783027367802455e-06, + "loss": 0.6237, + "step": 10338 + }, + { + "epoch": 3.8166379522520306, + "grad_norm": 0.9493112564086914, + "learning_rate": 1.4778588523496276e-06, + "loss": 0.6239, + "step": 10339 + }, + { + "epoch": 3.8170071375830665, + "grad_norm": 0.9353529810905457, + "learning_rate": 1.47741500660673e-06, + "loss": 0.5732, + "step": 10340 + }, + { + "epoch": 3.817376322914103, + "grad_norm": 0.9144046306610107, + "learning_rate": 1.476971199568351e-06, + "loss": 0.6007, + "step": 10341 + }, + { + "epoch": 3.817745508245139, + "grad_norm": 0.9600425958633423, + "learning_rate": 1.4765274312512889e-06, + "loss": 0.5954, + "step": 10342 + }, + { + "epoch": 3.818114693576175, + "grad_norm": 0.909868597984314, + "learning_rate": 1.4760837016723417e-06, + "loss": 0.5588, + "step": 10343 + }, + { + "epoch": 3.8184838789072115, + "grad_norm": 0.9369348287582397, + "learning_rate": 1.4756400108483026e-06, + "loss": 0.5856, + "step": 10344 + }, + { + "epoch": 3.8188530642382474, + "grad_norm": 0.9384814500808716, + "learning_rate": 1.4751963587959662e-06, + "loss": 0.6201, + "step": 10345 + }, + { + "epoch": 3.819222249569284, + "grad_norm": 0.9229404330253601, + "learning_rate": 1.4747527455321237e-06, + "loss": 0.5905, + "step": 10346 + }, + { + "epoch": 3.81959143490032, + "grad_norm": 0.9735158085823059, + "learning_rate": 1.4743091710735662e-06, + "loss": 0.62, + "step": 10347 + }, + { + "epoch": 3.819960620231356, + "grad_norm": 0.9603357315063477, + "learning_rate": 1.4738656354370833e-06, + "loss": 0.6116, + "step": 10348 + }, + { + "epoch": 3.8203298055623924, + "grad_norm": 0.938908576965332, + "learning_rate": 1.473422138639462e-06, + "loss": 0.5766, + "step": 10349 + }, + { + "epoch": 3.8206989908934283, + "grad_norm": 0.9505903124809265, + "learning_rate": 1.4729786806974891e-06, + "loss": 0.6066, + "step": 10350 + }, + { + "epoch": 3.8210681762244647, + "grad_norm": 0.9186946749687195, + "learning_rate": 1.4725352616279487e-06, + "loss": 0.6068, + "step": 10351 + }, + { + "epoch": 3.8214373615555006, + "grad_norm": 0.9359120726585388, + "learning_rate": 1.4720918814476234e-06, + "loss": 0.5878, + "step": 10352 + }, + { + "epoch": 3.821806546886537, + "grad_norm": 0.934124231338501, + "learning_rate": 1.4716485401732971e-06, + "loss": 0.6029, + "step": 10353 + }, + { + "epoch": 3.8221757322175733, + "grad_norm": 0.9415788650512695, + "learning_rate": 1.471205237821749e-06, + "loss": 0.5805, + "step": 10354 + }, + { + "epoch": 3.8225449175486093, + "grad_norm": 0.9575218558311462, + "learning_rate": 1.4707619744097578e-06, + "loss": 0.5898, + "step": 10355 + }, + { + "epoch": 3.8229141028796456, + "grad_norm": 0.9282961487770081, + "learning_rate": 1.4703187499541012e-06, + "loss": 0.6066, + "step": 10356 + }, + { + "epoch": 3.823283288210682, + "grad_norm": 0.9195553660392761, + "learning_rate": 1.4698755644715544e-06, + "loss": 0.5979, + "step": 10357 + }, + { + "epoch": 3.823652473541718, + "grad_norm": 0.9118049740791321, + "learning_rate": 1.4694324179788933e-06, + "loss": 0.5627, + "step": 10358 + }, + { + "epoch": 3.8240216588727542, + "grad_norm": 0.9279400110244751, + "learning_rate": 1.468989310492889e-06, + "loss": 0.5953, + "step": 10359 + }, + { + "epoch": 3.82439084420379, + "grad_norm": 0.9276732206344604, + "learning_rate": 1.4685462420303145e-06, + "loss": 0.5803, + "step": 10360 + }, + { + "epoch": 3.8247600295348265, + "grad_norm": 0.9256722927093506, + "learning_rate": 1.468103212607939e-06, + "loss": 0.5814, + "step": 10361 + }, + { + "epoch": 3.8251292148658624, + "grad_norm": 0.927947998046875, + "learning_rate": 1.4676602222425304e-06, + "loss": 0.583, + "step": 10362 + }, + { + "epoch": 3.825498400196899, + "grad_norm": 0.9467132687568665, + "learning_rate": 1.4672172709508579e-06, + "loss": 0.5841, + "step": 10363 + }, + { + "epoch": 3.825867585527935, + "grad_norm": 0.9403741359710693, + "learning_rate": 1.466774358749685e-06, + "loss": 0.5966, + "step": 10364 + }, + { + "epoch": 3.826236770858971, + "grad_norm": 0.9431484341621399, + "learning_rate": 1.466331485655777e-06, + "loss": 0.6149, + "step": 10365 + }, + { + "epoch": 3.8266059561900074, + "grad_norm": 0.9449676871299744, + "learning_rate": 1.4658886516858956e-06, + "loss": 0.5925, + "step": 10366 + }, + { + "epoch": 3.826975141521044, + "grad_norm": 0.9439797401428223, + "learning_rate": 1.4654458568568025e-06, + "loss": 0.5901, + "step": 10367 + }, + { + "epoch": 3.8273443268520797, + "grad_norm": 0.9239997267723083, + "learning_rate": 1.4650031011852574e-06, + "loss": 0.5537, + "step": 10368 + }, + { + "epoch": 3.827713512183116, + "grad_norm": 0.9284486174583435, + "learning_rate": 1.464560384688018e-06, + "loss": 0.5817, + "step": 10369 + }, + { + "epoch": 3.828082697514152, + "grad_norm": 0.9472487568855286, + "learning_rate": 1.4641177073818419e-06, + "loss": 0.5721, + "step": 10370 + }, + { + "epoch": 3.8284518828451883, + "grad_norm": 0.9383425712585449, + "learning_rate": 1.4636750692834828e-06, + "loss": 0.6067, + "step": 10371 + }, + { + "epoch": 3.8288210681762243, + "grad_norm": 0.9006226658821106, + "learning_rate": 1.463232470409695e-06, + "loss": 0.5496, + "step": 10372 + }, + { + "epoch": 3.8291902535072606, + "grad_norm": 0.9894443154335022, + "learning_rate": 1.462789910777232e-06, + "loss": 0.6262, + "step": 10373 + }, + { + "epoch": 3.829559438838297, + "grad_norm": 0.9558460116386414, + "learning_rate": 1.4623473904028429e-06, + "loss": 0.579, + "step": 10374 + }, + { + "epoch": 3.829928624169333, + "grad_norm": 0.9186485409736633, + "learning_rate": 1.461904909303278e-06, + "loss": 0.5686, + "step": 10375 + }, + { + "epoch": 3.8302978095003692, + "grad_norm": 0.9361640810966492, + "learning_rate": 1.4614624674952843e-06, + "loss": 0.5775, + "step": 10376 + }, + { + "epoch": 3.8306669948314056, + "grad_norm": 0.9479633569717407, + "learning_rate": 1.4610200649956085e-06, + "loss": 0.5886, + "step": 10377 + }, + { + "epoch": 3.8310361801624415, + "grad_norm": 0.9631937146186829, + "learning_rate": 1.4605777018209955e-06, + "loss": 0.5939, + "step": 10378 + }, + { + "epoch": 3.831405365493478, + "grad_norm": 0.9254176616668701, + "learning_rate": 1.4601353779881884e-06, + "loss": 0.5746, + "step": 10379 + }, + { + "epoch": 3.831774550824514, + "grad_norm": 0.9110586643218994, + "learning_rate": 1.4596930935139281e-06, + "loss": 0.5723, + "step": 10380 + }, + { + "epoch": 3.83214373615555, + "grad_norm": 0.969508945941925, + "learning_rate": 1.4592508484149578e-06, + "loss": 0.6324, + "step": 10381 + }, + { + "epoch": 3.832512921486586, + "grad_norm": 0.9079776406288147, + "learning_rate": 1.4588086427080127e-06, + "loss": 0.5834, + "step": 10382 + }, + { + "epoch": 3.8328821068176224, + "grad_norm": 0.9058175683021545, + "learning_rate": 1.458366476409833e-06, + "loss": 0.5883, + "step": 10383 + }, + { + "epoch": 3.833251292148659, + "grad_norm": 0.9026573300361633, + "learning_rate": 1.4579243495371531e-06, + "loss": 0.5604, + "step": 10384 + }, + { + "epoch": 3.8336204774796947, + "grad_norm": 0.9381481409072876, + "learning_rate": 1.457482262106707e-06, + "loss": 0.5701, + "step": 10385 + }, + { + "epoch": 3.833989662810731, + "grad_norm": 0.959997296333313, + "learning_rate": 1.4570402141352292e-06, + "loss": 0.5925, + "step": 10386 + }, + { + "epoch": 3.8343588481417674, + "grad_norm": 0.904129683971405, + "learning_rate": 1.4565982056394492e-06, + "loss": 0.5875, + "step": 10387 + }, + { + "epoch": 3.8347280334728033, + "grad_norm": 0.9288750886917114, + "learning_rate": 1.4561562366360987e-06, + "loss": 0.5785, + "step": 10388 + }, + { + "epoch": 3.8350972188038397, + "grad_norm": 0.9281448125839233, + "learning_rate": 1.4557143071419053e-06, + "loss": 0.5639, + "step": 10389 + }, + { + "epoch": 3.8354664041348756, + "grad_norm": 0.908295214176178, + "learning_rate": 1.4552724171735946e-06, + "loss": 0.5564, + "step": 10390 + }, + { + "epoch": 3.835835589465912, + "grad_norm": 0.9632132053375244, + "learning_rate": 1.4548305667478947e-06, + "loss": 0.6122, + "step": 10391 + }, + { + "epoch": 3.836204774796948, + "grad_norm": 0.9150925874710083, + "learning_rate": 1.4543887558815273e-06, + "loss": 0.6203, + "step": 10392 + }, + { + "epoch": 3.8365739601279842, + "grad_norm": 0.935650110244751, + "learning_rate": 1.4539469845912158e-06, + "loss": 0.5801, + "step": 10393 + }, + { + "epoch": 3.8369431454590206, + "grad_norm": 0.9376907348632812, + "learning_rate": 1.45350525289368e-06, + "loss": 0.5913, + "step": 10394 + }, + { + "epoch": 3.8373123307900565, + "grad_norm": 0.9225543737411499, + "learning_rate": 1.4530635608056398e-06, + "loss": 0.5891, + "step": 10395 + }, + { + "epoch": 3.837681516121093, + "grad_norm": 0.9487529397010803, + "learning_rate": 1.4526219083438153e-06, + "loss": 0.6001, + "step": 10396 + }, + { + "epoch": 3.8380507014521292, + "grad_norm": 0.9127880930900574, + "learning_rate": 1.4521802955249193e-06, + "loss": 0.5693, + "step": 10397 + }, + { + "epoch": 3.838419886783165, + "grad_norm": 0.9218186736106873, + "learning_rate": 1.4517387223656693e-06, + "loss": 0.5555, + "step": 10398 + }, + { + "epoch": 3.838789072114201, + "grad_norm": 1.1543201208114624, + "learning_rate": 1.4512971888827768e-06, + "loss": 0.6117, + "step": 10399 + }, + { + "epoch": 3.8391582574452374, + "grad_norm": 0.9228237867355347, + "learning_rate": 1.4508556950929558e-06, + "loss": 0.6088, + "step": 10400 + }, + { + "epoch": 3.839527442776274, + "grad_norm": 0.933327853679657, + "learning_rate": 1.4504142410129158e-06, + "loss": 0.564, + "step": 10401 + }, + { + "epoch": 3.8398966281073097, + "grad_norm": 0.9235564470291138, + "learning_rate": 1.449972826659364e-06, + "loss": 0.5609, + "step": 10402 + }, + { + "epoch": 3.840265813438346, + "grad_norm": 0.914630651473999, + "learning_rate": 1.4495314520490106e-06, + "loss": 0.5765, + "step": 10403 + }, + { + "epoch": 3.8406349987693824, + "grad_norm": 0.9435351490974426, + "learning_rate": 1.4490901171985605e-06, + "loss": 0.5906, + "step": 10404 + }, + { + "epoch": 3.8410041841004183, + "grad_norm": 0.9545471668243408, + "learning_rate": 1.4486488221247163e-06, + "loss": 0.6018, + "step": 10405 + }, + { + "epoch": 3.8413733694314547, + "grad_norm": 0.9232897758483887, + "learning_rate": 1.4482075668441836e-06, + "loss": 0.5815, + "step": 10406 + }, + { + "epoch": 3.841742554762491, + "grad_norm": 0.943171501159668, + "learning_rate": 1.4477663513736618e-06, + "loss": 0.6318, + "step": 10407 + }, + { + "epoch": 3.842111740093527, + "grad_norm": 0.9269242286682129, + "learning_rate": 1.4473251757298521e-06, + "loss": 0.6075, + "step": 10408 + }, + { + "epoch": 3.842480925424563, + "grad_norm": 0.9177525043487549, + "learning_rate": 1.4468840399294523e-06, + "loss": 0.567, + "step": 10409 + }, + { + "epoch": 3.8428501107555992, + "grad_norm": 0.928290843963623, + "learning_rate": 1.4464429439891584e-06, + "loss": 0.6024, + "step": 10410 + }, + { + "epoch": 3.8432192960866356, + "grad_norm": 0.9475289583206177, + "learning_rate": 1.4460018879256676e-06, + "loss": 0.5772, + "step": 10411 + }, + { + "epoch": 3.8435884814176715, + "grad_norm": 0.9153943657875061, + "learning_rate": 1.4455608717556724e-06, + "loss": 0.5739, + "step": 10412 + }, + { + "epoch": 3.843957666748708, + "grad_norm": 0.9480446577072144, + "learning_rate": 1.4451198954958656e-06, + "loss": 0.628, + "step": 10413 + }, + { + "epoch": 3.8443268520797442, + "grad_norm": 0.9056901931762695, + "learning_rate": 1.444678959162937e-06, + "loss": 0.5836, + "step": 10414 + }, + { + "epoch": 3.84469603741078, + "grad_norm": 0.9463080763816833, + "learning_rate": 1.4442380627735764e-06, + "loss": 0.5868, + "step": 10415 + }, + { + "epoch": 3.8450652227418165, + "grad_norm": 0.942337691783905, + "learning_rate": 1.4437972063444744e-06, + "loss": 0.5977, + "step": 10416 + }, + { + "epoch": 3.8454344080728524, + "grad_norm": 0.8932373523712158, + "learning_rate": 1.4433563898923125e-06, + "loss": 0.6002, + "step": 10417 + }, + { + "epoch": 3.8458035934038888, + "grad_norm": 0.9163549542427063, + "learning_rate": 1.4429156134337776e-06, + "loss": 0.5825, + "step": 10418 + }, + { + "epoch": 3.8461727787349247, + "grad_norm": 0.9306100606918335, + "learning_rate": 1.4424748769855543e-06, + "loss": 0.6075, + "step": 10419 + }, + { + "epoch": 3.846541964065961, + "grad_norm": 0.9340976476669312, + "learning_rate": 1.4420341805643232e-06, + "loss": 0.6026, + "step": 10420 + }, + { + "epoch": 3.8469111493969974, + "grad_norm": 0.9484961032867432, + "learning_rate": 1.4415935241867643e-06, + "loss": 0.5856, + "step": 10421 + }, + { + "epoch": 3.8472803347280333, + "grad_norm": 0.9460262656211853, + "learning_rate": 1.4411529078695558e-06, + "loss": 0.5916, + "step": 10422 + }, + { + "epoch": 3.8476495200590697, + "grad_norm": 0.9910892248153687, + "learning_rate": 1.440712331629375e-06, + "loss": 0.5951, + "step": 10423 + }, + { + "epoch": 3.848018705390106, + "grad_norm": 0.9425603151321411, + "learning_rate": 1.4402717954829003e-06, + "loss": 0.6019, + "step": 10424 + }, + { + "epoch": 3.848387890721142, + "grad_norm": 0.9169489741325378, + "learning_rate": 1.4398312994468021e-06, + "loss": 0.5777, + "step": 10425 + }, + { + "epoch": 3.8487570760521783, + "grad_norm": 0.9110846519470215, + "learning_rate": 1.4393908435377552e-06, + "loss": 0.5913, + "step": 10426 + }, + { + "epoch": 3.8491262613832142, + "grad_norm": 0.9280432462692261, + "learning_rate": 1.4389504277724292e-06, + "loss": 0.6194, + "step": 10427 + }, + { + "epoch": 3.8494954467142506, + "grad_norm": 0.9534717798233032, + "learning_rate": 1.438510052167496e-06, + "loss": 0.5883, + "step": 10428 + }, + { + "epoch": 3.8498646320452865, + "grad_norm": 0.9633135199546814, + "learning_rate": 1.4380697167396219e-06, + "loss": 0.5841, + "step": 10429 + }, + { + "epoch": 3.850233817376323, + "grad_norm": 0.937170147895813, + "learning_rate": 1.4376294215054732e-06, + "loss": 0.615, + "step": 10430 + }, + { + "epoch": 3.8506030027073592, + "grad_norm": 0.9759715795516968, + "learning_rate": 1.4371891664817162e-06, + "loss": 0.6046, + "step": 10431 + }, + { + "epoch": 3.850972188038395, + "grad_norm": 0.9390041828155518, + "learning_rate": 1.4367489516850141e-06, + "loss": 0.5961, + "step": 10432 + }, + { + "epoch": 3.8513413733694315, + "grad_norm": 0.9300847053527832, + "learning_rate": 1.4363087771320278e-06, + "loss": 0.6037, + "step": 10433 + }, + { + "epoch": 3.851710558700468, + "grad_norm": 0.9472296237945557, + "learning_rate": 1.4358686428394192e-06, + "loss": 0.6401, + "step": 10434 + }, + { + "epoch": 3.8520797440315038, + "grad_norm": 0.9081555008888245, + "learning_rate": 1.4354285488238457e-06, + "loss": 0.593, + "step": 10435 + }, + { + "epoch": 3.85244892936254, + "grad_norm": 0.902951180934906, + "learning_rate": 1.4349884951019676e-06, + "loss": 0.5581, + "step": 10436 + }, + { + "epoch": 3.852818114693576, + "grad_norm": 0.9226285219192505, + "learning_rate": 1.434548481690437e-06, + "loss": 0.5914, + "step": 10437 + }, + { + "epoch": 3.8531873000246124, + "grad_norm": 0.9099683165550232, + "learning_rate": 1.43410850860591e-06, + "loss": 0.5809, + "step": 10438 + }, + { + "epoch": 3.8535564853556483, + "grad_norm": 0.9402549266815186, + "learning_rate": 1.4336685758650406e-06, + "loss": 0.5959, + "step": 10439 + }, + { + "epoch": 3.8539256706866847, + "grad_norm": 0.9584069848060608, + "learning_rate": 1.4332286834844792e-06, + "loss": 0.6233, + "step": 10440 + }, + { + "epoch": 3.854294856017721, + "grad_norm": 0.9217606782913208, + "learning_rate": 1.4327888314808753e-06, + "loss": 0.5947, + "step": 10441 + }, + { + "epoch": 3.854664041348757, + "grad_norm": 0.9250696897506714, + "learning_rate": 1.4323490198708762e-06, + "loss": 0.5819, + "step": 10442 + }, + { + "epoch": 3.8550332266797933, + "grad_norm": 0.9498060941696167, + "learning_rate": 1.43190924867113e-06, + "loss": 0.6027, + "step": 10443 + }, + { + "epoch": 3.8554024120108297, + "grad_norm": 0.945257306098938, + "learning_rate": 1.4314695178982836e-06, + "loss": 0.6415, + "step": 10444 + }, + { + "epoch": 3.8557715973418656, + "grad_norm": 0.9360442757606506, + "learning_rate": 1.4310298275689766e-06, + "loss": 0.5752, + "step": 10445 + }, + { + "epoch": 3.856140782672902, + "grad_norm": 0.9610320925712585, + "learning_rate": 1.430590177699854e-06, + "loss": 0.6136, + "step": 10446 + }, + { + "epoch": 3.856509968003938, + "grad_norm": 0.9356186985969543, + "learning_rate": 1.430150568307555e-06, + "loss": 0.601, + "step": 10447 + }, + { + "epoch": 3.856879153334974, + "grad_norm": 0.9316152334213257, + "learning_rate": 1.4297109994087205e-06, + "loss": 0.6289, + "step": 10448 + }, + { + "epoch": 3.85724833866601, + "grad_norm": 0.9421979188919067, + "learning_rate": 1.4292714710199866e-06, + "loss": 0.613, + "step": 10449 + }, + { + "epoch": 3.8576175239970465, + "grad_norm": 0.9553661942481995, + "learning_rate": 1.4288319831579882e-06, + "loss": 0.5908, + "step": 10450 + }, + { + "epoch": 3.857986709328083, + "grad_norm": 0.9196453094482422, + "learning_rate": 1.428392535839363e-06, + "loss": 0.6323, + "step": 10451 + }, + { + "epoch": 3.8583558946591188, + "grad_norm": 0.9353943467140198, + "learning_rate": 1.4279531290807416e-06, + "loss": 0.5957, + "step": 10452 + }, + { + "epoch": 3.858725079990155, + "grad_norm": 0.9113935232162476, + "learning_rate": 1.4275137628987552e-06, + "loss": 0.5731, + "step": 10453 + }, + { + "epoch": 3.8590942653211915, + "grad_norm": 0.9272052645683289, + "learning_rate": 1.4270744373100354e-06, + "loss": 0.6018, + "step": 10454 + }, + { + "epoch": 3.8594634506522274, + "grad_norm": 0.9237297177314758, + "learning_rate": 1.4266351523312084e-06, + "loss": 0.5323, + "step": 10455 + }, + { + "epoch": 3.8598326359832638, + "grad_norm": 0.9918975234031677, + "learning_rate": 1.426195907978903e-06, + "loss": 0.5757, + "step": 10456 + }, + { + "epoch": 3.8602018213142997, + "grad_norm": 0.9904982447624207, + "learning_rate": 1.4257567042697434e-06, + "loss": 0.5868, + "step": 10457 + }, + { + "epoch": 3.860571006645336, + "grad_norm": 0.9119181036949158, + "learning_rate": 1.425317541220353e-06, + "loss": 0.5424, + "step": 10458 + }, + { + "epoch": 3.860940191976372, + "grad_norm": 0.9576665163040161, + "learning_rate": 1.4248784188473552e-06, + "loss": 0.5937, + "step": 10459 + }, + { + "epoch": 3.8613093773074083, + "grad_norm": 0.9358442425727844, + "learning_rate": 1.42443933716737e-06, + "loss": 0.607, + "step": 10460 + }, + { + "epoch": 3.8616785626384447, + "grad_norm": 0.9456491470336914, + "learning_rate": 1.4240002961970157e-06, + "loss": 0.5674, + "step": 10461 + }, + { + "epoch": 3.8620477479694806, + "grad_norm": 0.9257463216781616, + "learning_rate": 1.4235612959529111e-06, + "loss": 0.577, + "step": 10462 + }, + { + "epoch": 3.862416933300517, + "grad_norm": 0.9200669527053833, + "learning_rate": 1.4231223364516711e-06, + "loss": 0.5698, + "step": 10463 + }, + { + "epoch": 3.8627861186315533, + "grad_norm": 0.93698650598526, + "learning_rate": 1.4226834177099125e-06, + "loss": 0.5882, + "step": 10464 + }, + { + "epoch": 3.863155303962589, + "grad_norm": 0.9210671782493591, + "learning_rate": 1.4222445397442447e-06, + "loss": 0.5901, + "step": 10465 + }, + { + "epoch": 3.863524489293625, + "grad_norm": 0.9001872539520264, + "learning_rate": 1.4218057025712812e-06, + "loss": 0.5508, + "step": 10466 + }, + { + "epoch": 3.8638936746246615, + "grad_norm": 0.9515374302864075, + "learning_rate": 1.4213669062076324e-06, + "loss": 0.595, + "step": 10467 + }, + { + "epoch": 3.864262859955698, + "grad_norm": 0.9638783931732178, + "learning_rate": 1.4209281506699055e-06, + "loss": 0.59, + "step": 10468 + }, + { + "epoch": 3.8646320452867338, + "grad_norm": 0.942181408405304, + "learning_rate": 1.4204894359747078e-06, + "loss": 0.5896, + "step": 10469 + }, + { + "epoch": 3.86500123061777, + "grad_norm": 0.9277399778366089, + "learning_rate": 1.420050762138643e-06, + "loss": 0.604, + "step": 10470 + }, + { + "epoch": 3.8653704159488065, + "grad_norm": 0.9317336678504944, + "learning_rate": 1.4196121291783163e-06, + "loss": 0.5615, + "step": 10471 + }, + { + "epoch": 3.8657396012798424, + "grad_norm": 0.9809306859970093, + "learning_rate": 1.4191735371103315e-06, + "loss": 0.5777, + "step": 10472 + }, + { + "epoch": 3.8661087866108788, + "grad_norm": 0.9364567995071411, + "learning_rate": 1.418734985951285e-06, + "loss": 0.5805, + "step": 10473 + }, + { + "epoch": 3.866477971941915, + "grad_norm": 0.9068728089332581, + "learning_rate": 1.4182964757177792e-06, + "loss": 0.5777, + "step": 10474 + }, + { + "epoch": 3.866847157272951, + "grad_norm": 0.9185603857040405, + "learning_rate": 1.4178580064264094e-06, + "loss": 0.5603, + "step": 10475 + }, + { + "epoch": 3.867216342603987, + "grad_norm": 0.9628227353096008, + "learning_rate": 1.4174195780937734e-06, + "loss": 0.6212, + "step": 10476 + }, + { + "epoch": 3.8675855279350233, + "grad_norm": 0.9163797497749329, + "learning_rate": 1.416981190736465e-06, + "loss": 0.5885, + "step": 10477 + }, + { + "epoch": 3.8679547132660597, + "grad_norm": 0.9180100560188293, + "learning_rate": 1.4165428443710755e-06, + "loss": 0.5486, + "step": 10478 + }, + { + "epoch": 3.8683238985970956, + "grad_norm": 0.9352014064788818, + "learning_rate": 1.4161045390141981e-06, + "loss": 0.5755, + "step": 10479 + }, + { + "epoch": 3.868693083928132, + "grad_norm": 0.9227885603904724, + "learning_rate": 1.4156662746824224e-06, + "loss": 0.5964, + "step": 10480 + }, + { + "epoch": 3.8690622692591683, + "grad_norm": 0.9247732758522034, + "learning_rate": 1.4152280513923345e-06, + "loss": 0.5865, + "step": 10481 + }, + { + "epoch": 3.869431454590204, + "grad_norm": 0.9288510680198669, + "learning_rate": 1.414789869160524e-06, + "loss": 0.5933, + "step": 10482 + }, + { + "epoch": 3.8698006399212406, + "grad_norm": 0.9325153231620789, + "learning_rate": 1.414351728003573e-06, + "loss": 0.6037, + "step": 10483 + }, + { + "epoch": 3.8701698252522765, + "grad_norm": 0.9101933240890503, + "learning_rate": 1.4139136279380686e-06, + "loss": 0.5648, + "step": 10484 + }, + { + "epoch": 3.870539010583313, + "grad_norm": 0.914971649646759, + "learning_rate": 1.4134755689805884e-06, + "loss": 0.5788, + "step": 10485 + }, + { + "epoch": 3.8709081959143488, + "grad_norm": 0.9628584980964661, + "learning_rate": 1.4130375511477154e-06, + "loss": 0.5995, + "step": 10486 + }, + { + "epoch": 3.871277381245385, + "grad_norm": 0.9182320833206177, + "learning_rate": 1.412599574456029e-06, + "loss": 0.5605, + "step": 10487 + }, + { + "epoch": 3.8716465665764215, + "grad_norm": 0.9030892848968506, + "learning_rate": 1.4121616389221055e-06, + "loss": 0.5903, + "step": 10488 + }, + { + "epoch": 3.8720157519074574, + "grad_norm": 0.9157236814498901, + "learning_rate": 1.4117237445625194e-06, + "loss": 0.5742, + "step": 10489 + }, + { + "epoch": 3.8723849372384938, + "grad_norm": 0.947378933429718, + "learning_rate": 1.4112858913938471e-06, + "loss": 0.6021, + "step": 10490 + }, + { + "epoch": 3.87275412256953, + "grad_norm": 0.9651888012886047, + "learning_rate": 1.4108480794326595e-06, + "loss": 0.6086, + "step": 10491 + }, + { + "epoch": 3.873123307900566, + "grad_norm": 0.9618120193481445, + "learning_rate": 1.41041030869553e-06, + "loss": 0.6206, + "step": 10492 + }, + { + "epoch": 3.8734924932316024, + "grad_norm": 0.9707621335983276, + "learning_rate": 1.4099725791990248e-06, + "loss": 0.5977, + "step": 10493 + }, + { + "epoch": 3.8738616785626383, + "grad_norm": 0.9471523761749268, + "learning_rate": 1.4095348909597135e-06, + "loss": 0.589, + "step": 10494 + }, + { + "epoch": 3.8742308638936747, + "grad_norm": 0.9420132040977478, + "learning_rate": 1.409097243994163e-06, + "loss": 0.6069, + "step": 10495 + }, + { + "epoch": 3.8746000492247106, + "grad_norm": 0.9648069143295288, + "learning_rate": 1.4086596383189376e-06, + "loss": 0.5832, + "step": 10496 + }, + { + "epoch": 3.874969234555747, + "grad_norm": 0.9185773134231567, + "learning_rate": 1.4082220739506006e-06, + "loss": 0.5934, + "step": 10497 + }, + { + "epoch": 3.8753384198867833, + "grad_norm": 0.9297199845314026, + "learning_rate": 1.4077845509057126e-06, + "loss": 0.6053, + "step": 10498 + }, + { + "epoch": 3.875707605217819, + "grad_norm": 0.9281538128852844, + "learning_rate": 1.4073470692008356e-06, + "loss": 0.5748, + "step": 10499 + }, + { + "epoch": 3.8760767905488556, + "grad_norm": 0.9157599210739136, + "learning_rate": 1.4069096288525268e-06, + "loss": 0.5877, + "step": 10500 + }, + { + "epoch": 3.876445975879892, + "grad_norm": 0.9371123909950256, + "learning_rate": 1.4064722298773431e-06, + "loss": 0.6032, + "step": 10501 + }, + { + "epoch": 3.876815161210928, + "grad_norm": 0.9509853720664978, + "learning_rate": 1.4060348722918412e-06, + "loss": 0.5697, + "step": 10502 + }, + { + "epoch": 3.877184346541964, + "grad_norm": 0.9030172228813171, + "learning_rate": 1.4055975561125732e-06, + "loss": 0.5727, + "step": 10503 + }, + { + "epoch": 3.877553531873, + "grad_norm": 0.8888303637504578, + "learning_rate": 1.4051602813560932e-06, + "loss": 0.5393, + "step": 10504 + }, + { + "epoch": 3.8779227172040365, + "grad_norm": 0.9433321356773376, + "learning_rate": 1.4047230480389509e-06, + "loss": 0.5943, + "step": 10505 + }, + { + "epoch": 3.8782919025350724, + "grad_norm": 0.9547404646873474, + "learning_rate": 1.4042858561776946e-06, + "loss": 0.5717, + "step": 10506 + }, + { + "epoch": 3.8786610878661087, + "grad_norm": 0.9097065925598145, + "learning_rate": 1.403848705788874e-06, + "loss": 0.573, + "step": 10507 + }, + { + "epoch": 3.879030273197145, + "grad_norm": 0.9424798488616943, + "learning_rate": 1.4034115968890338e-06, + "loss": 0.5892, + "step": 10508 + }, + { + "epoch": 3.879399458528181, + "grad_norm": 0.911952018737793, + "learning_rate": 1.4029745294947178e-06, + "loss": 0.5863, + "step": 10509 + }, + { + "epoch": 3.8797686438592174, + "grad_norm": 0.9727878570556641, + "learning_rate": 1.4025375036224702e-06, + "loss": 0.6508, + "step": 10510 + }, + { + "epoch": 3.8801378291902537, + "grad_norm": 0.9297665357589722, + "learning_rate": 1.402100519288831e-06, + "loss": 0.6011, + "step": 10511 + }, + { + "epoch": 3.8805070145212897, + "grad_norm": 0.941654622554779, + "learning_rate": 1.4016635765103425e-06, + "loss": 0.5547, + "step": 10512 + }, + { + "epoch": 3.880876199852326, + "grad_norm": 0.9328834414482117, + "learning_rate": 1.4012266753035392e-06, + "loss": 0.5966, + "step": 10513 + }, + { + "epoch": 3.881245385183362, + "grad_norm": 0.9527391195297241, + "learning_rate": 1.4007898156849595e-06, + "loss": 0.5715, + "step": 10514 + }, + { + "epoch": 3.8816145705143983, + "grad_norm": 0.9604103565216064, + "learning_rate": 1.4003529976711392e-06, + "loss": 0.5949, + "step": 10515 + }, + { + "epoch": 3.881983755845434, + "grad_norm": 0.9671992063522339, + "learning_rate": 1.399916221278611e-06, + "loss": 0.6442, + "step": 10516 + }, + { + "epoch": 3.8823529411764706, + "grad_norm": 0.9346907734870911, + "learning_rate": 1.3994794865239064e-06, + "loss": 0.6145, + "step": 10517 + }, + { + "epoch": 3.882722126507507, + "grad_norm": 0.9331640601158142, + "learning_rate": 1.399042793423555e-06, + "loss": 0.5974, + "step": 10518 + }, + { + "epoch": 3.883091311838543, + "grad_norm": 0.9252318739891052, + "learning_rate": 1.3986061419940874e-06, + "loss": 0.6004, + "step": 10519 + }, + { + "epoch": 3.883460497169579, + "grad_norm": 0.9581965804100037, + "learning_rate": 1.39816953225203e-06, + "loss": 0.593, + "step": 10520 + }, + { + "epoch": 3.8838296825006156, + "grad_norm": 0.9666464924812317, + "learning_rate": 1.3977329642139069e-06, + "loss": 0.5857, + "step": 10521 + }, + { + "epoch": 3.8841988678316515, + "grad_norm": 0.9473251700401306, + "learning_rate": 1.397296437896244e-06, + "loss": 0.5976, + "step": 10522 + }, + { + "epoch": 3.884568053162688, + "grad_norm": 0.976018488407135, + "learning_rate": 1.396859953315562e-06, + "loss": 0.5416, + "step": 10523 + }, + { + "epoch": 3.8849372384937237, + "grad_norm": 0.9218254685401917, + "learning_rate": 1.3964235104883838e-06, + "loss": 0.6016, + "step": 10524 + }, + { + "epoch": 3.88530642382476, + "grad_norm": 0.9276629090309143, + "learning_rate": 1.3959871094312272e-06, + "loss": 0.5725, + "step": 10525 + }, + { + "epoch": 3.885675609155796, + "grad_norm": 0.9332146048545837, + "learning_rate": 1.3955507501606092e-06, + "loss": 0.5939, + "step": 10526 + }, + { + "epoch": 3.8860447944868324, + "grad_norm": 0.9295245409011841, + "learning_rate": 1.3951144326930477e-06, + "loss": 0.5972, + "step": 10527 + }, + { + "epoch": 3.8864139798178687, + "grad_norm": 0.9273895025253296, + "learning_rate": 1.3946781570450563e-06, + "loss": 0.6212, + "step": 10528 + }, + { + "epoch": 3.8867831651489047, + "grad_norm": 0.9367389678955078, + "learning_rate": 1.3942419232331467e-06, + "loss": 0.5833, + "step": 10529 + }, + { + "epoch": 3.887152350479941, + "grad_norm": 0.943713366985321, + "learning_rate": 1.3938057312738322e-06, + "loss": 0.6106, + "step": 10530 + }, + { + "epoch": 3.8875215358109774, + "grad_norm": 0.9543384909629822, + "learning_rate": 1.393369581183621e-06, + "loss": 0.5505, + "step": 10531 + }, + { + "epoch": 3.8878907211420133, + "grad_norm": 0.9356694221496582, + "learning_rate": 1.3929334729790228e-06, + "loss": 0.5963, + "step": 10532 + }, + { + "epoch": 3.888259906473049, + "grad_norm": 0.9521318078041077, + "learning_rate": 1.392497406676543e-06, + "loss": 0.5932, + "step": 10533 + }, + { + "epoch": 3.8886290918040856, + "grad_norm": 0.9336168169975281, + "learning_rate": 1.392061382292686e-06, + "loss": 0.5803, + "step": 10534 + }, + { + "epoch": 3.888998277135122, + "grad_norm": 0.9368258118629456, + "learning_rate": 1.3916253998439572e-06, + "loss": 0.6, + "step": 10535 + }, + { + "epoch": 3.889367462466158, + "grad_norm": 0.9416333436965942, + "learning_rate": 1.3911894593468567e-06, + "loss": 0.6172, + "step": 10536 + }, + { + "epoch": 3.889736647797194, + "grad_norm": 0.9271630048751831, + "learning_rate": 1.3907535608178846e-06, + "loss": 0.5934, + "step": 10537 + }, + { + "epoch": 3.8901058331282306, + "grad_norm": 0.9456373453140259, + "learning_rate": 1.3903177042735411e-06, + "loss": 0.5993, + "step": 10538 + }, + { + "epoch": 3.8904750184592665, + "grad_norm": 0.932845950126648, + "learning_rate": 1.3898818897303212e-06, + "loss": 0.624, + "step": 10539 + }, + { + "epoch": 3.890844203790303, + "grad_norm": 0.9562900066375732, + "learning_rate": 1.3894461172047235e-06, + "loss": 0.6098, + "step": 10540 + }, + { + "epoch": 3.891213389121339, + "grad_norm": 0.9130735397338867, + "learning_rate": 1.3890103867132377e-06, + "loss": 0.5775, + "step": 10541 + }, + { + "epoch": 3.891582574452375, + "grad_norm": 0.9060984253883362, + "learning_rate": 1.388574698272358e-06, + "loss": 0.59, + "step": 10542 + }, + { + "epoch": 3.891951759783411, + "grad_norm": 0.9360846281051636, + "learning_rate": 1.3881390518985766e-06, + "loss": 0.6135, + "step": 10543 + }, + { + "epoch": 3.8923209451144474, + "grad_norm": 0.9063795208930969, + "learning_rate": 1.3877034476083805e-06, + "loss": 0.607, + "step": 10544 + }, + { + "epoch": 3.8926901304454837, + "grad_norm": 0.9239111542701721, + "learning_rate": 1.3872678854182586e-06, + "loss": 0.5929, + "step": 10545 + }, + { + "epoch": 3.8930593157765196, + "grad_norm": 0.9382631182670593, + "learning_rate": 1.3868323653446948e-06, + "loss": 0.5843, + "step": 10546 + }, + { + "epoch": 3.893428501107556, + "grad_norm": 0.9430416822433472, + "learning_rate": 1.3863968874041755e-06, + "loss": 0.5917, + "step": 10547 + }, + { + "epoch": 3.8937976864385924, + "grad_norm": 0.9548263549804688, + "learning_rate": 1.3859614516131826e-06, + "loss": 0.6001, + "step": 10548 + }, + { + "epoch": 3.8941668717696283, + "grad_norm": 0.9242935180664062, + "learning_rate": 1.3855260579881964e-06, + "loss": 0.6026, + "step": 10549 + }, + { + "epoch": 3.8945360571006646, + "grad_norm": 0.9565035700798035, + "learning_rate": 1.3850907065456978e-06, + "loss": 0.585, + "step": 10550 + }, + { + "epoch": 3.8949052424317006, + "grad_norm": 1.0072144269943237, + "learning_rate": 1.384655397302163e-06, + "loss": 0.5624, + "step": 10551 + }, + { + "epoch": 3.895274427762737, + "grad_norm": 0.9487705230712891, + "learning_rate": 1.384220130274071e-06, + "loss": 0.5854, + "step": 10552 + }, + { + "epoch": 3.895643613093773, + "grad_norm": 0.9243864417076111, + "learning_rate": 1.3837849054778945e-06, + "loss": 0.5546, + "step": 10553 + }, + { + "epoch": 3.896012798424809, + "grad_norm": 0.959321916103363, + "learning_rate": 1.3833497229301062e-06, + "loss": 0.6017, + "step": 10554 + }, + { + "epoch": 3.8963819837558455, + "grad_norm": 0.9228194952011108, + "learning_rate": 1.3829145826471792e-06, + "loss": 0.5984, + "step": 10555 + }, + { + "epoch": 3.8967511690868815, + "grad_norm": 0.9259408712387085, + "learning_rate": 1.3824794846455831e-06, + "loss": 0.579, + "step": 10556 + }, + { + "epoch": 3.897120354417918, + "grad_norm": 0.9378489851951599, + "learning_rate": 1.3820444289417845e-06, + "loss": 0.6342, + "step": 10557 + }, + { + "epoch": 3.897489539748954, + "grad_norm": 0.9601992964744568, + "learning_rate": 1.3816094155522526e-06, + "loss": 0.6082, + "step": 10558 + }, + { + "epoch": 3.89785872507999, + "grad_norm": 0.9289948344230652, + "learning_rate": 1.3811744444934503e-06, + "loss": 0.5829, + "step": 10559 + }, + { + "epoch": 3.8982279104110265, + "grad_norm": 0.923577070236206, + "learning_rate": 1.3807395157818443e-06, + "loss": 0.5607, + "step": 10560 + }, + { + "epoch": 3.8985970957420624, + "grad_norm": 0.9458911418914795, + "learning_rate": 1.380304629433892e-06, + "loss": 0.6047, + "step": 10561 + }, + { + "epoch": 3.8989662810730987, + "grad_norm": 0.9320870637893677, + "learning_rate": 1.3798697854660565e-06, + "loss": 0.6044, + "step": 10562 + }, + { + "epoch": 3.8993354664041346, + "grad_norm": 0.9465341567993164, + "learning_rate": 1.3794349838947967e-06, + "loss": 0.5765, + "step": 10563 + }, + { + "epoch": 3.899704651735171, + "grad_norm": 0.9270962476730347, + "learning_rate": 1.3790002247365695e-06, + "loss": 0.5739, + "step": 10564 + }, + { + "epoch": 3.9000738370662074, + "grad_norm": 0.9540504813194275, + "learning_rate": 1.3785655080078285e-06, + "loss": 0.5668, + "step": 10565 + }, + { + "epoch": 3.9004430223972433, + "grad_norm": 0.9162954092025757, + "learning_rate": 1.3781308337250307e-06, + "loss": 0.5542, + "step": 10566 + }, + { + "epoch": 3.9008122077282796, + "grad_norm": 0.9516668915748596, + "learning_rate": 1.3776962019046264e-06, + "loss": 0.563, + "step": 10567 + }, + { + "epoch": 3.901181393059316, + "grad_norm": 0.986883819103241, + "learning_rate": 1.3772616125630669e-06, + "loss": 0.5789, + "step": 10568 + }, + { + "epoch": 3.901550578390352, + "grad_norm": 0.9181643128395081, + "learning_rate": 1.3768270657167999e-06, + "loss": 0.6048, + "step": 10569 + }, + { + "epoch": 3.9019197637213883, + "grad_norm": 0.9619367122650146, + "learning_rate": 1.376392561382274e-06, + "loss": 0.5931, + "step": 10570 + }, + { + "epoch": 3.902288949052424, + "grad_norm": 0.9445012807846069, + "learning_rate": 1.3759580995759358e-06, + "loss": 0.5573, + "step": 10571 + }, + { + "epoch": 3.9026581343834605, + "grad_norm": 0.9159151315689087, + "learning_rate": 1.375523680314229e-06, + "loss": 0.552, + "step": 10572 + }, + { + "epoch": 3.9030273197144965, + "grad_norm": 0.93753981590271, + "learning_rate": 1.375089303613596e-06, + "loss": 0.573, + "step": 10573 + }, + { + "epoch": 3.903396505045533, + "grad_norm": 0.9343954920768738, + "learning_rate": 1.3746549694904771e-06, + "loss": 0.5754, + "step": 10574 + }, + { + "epoch": 3.903765690376569, + "grad_norm": 0.9085758328437805, + "learning_rate": 1.3742206779613132e-06, + "loss": 0.5542, + "step": 10575 + }, + { + "epoch": 3.904134875707605, + "grad_norm": 0.9466039538383484, + "learning_rate": 1.3737864290425413e-06, + "loss": 0.5911, + "step": 10576 + }, + { + "epoch": 3.9045040610386414, + "grad_norm": 0.9260059595108032, + "learning_rate": 1.3733522227505963e-06, + "loss": 0.5695, + "step": 10577 + }, + { + "epoch": 3.904873246369678, + "grad_norm": 0.911638081073761, + "learning_rate": 1.3729180591019154e-06, + "loss": 0.5973, + "step": 10578 + }, + { + "epoch": 3.9052424317007137, + "grad_norm": 0.9293588995933533, + "learning_rate": 1.3724839381129296e-06, + "loss": 0.5825, + "step": 10579 + }, + { + "epoch": 3.90561161703175, + "grad_norm": 0.9659173488616943, + "learning_rate": 1.3720498598000714e-06, + "loss": 0.6012, + "step": 10580 + }, + { + "epoch": 3.905980802362786, + "grad_norm": 0.9300582408905029, + "learning_rate": 1.37161582417977e-06, + "loss": 0.5995, + "step": 10581 + }, + { + "epoch": 3.9063499876938224, + "grad_norm": 0.9108884930610657, + "learning_rate": 1.3711818312684526e-06, + "loss": 0.579, + "step": 10582 + }, + { + "epoch": 3.9067191730248583, + "grad_norm": 0.9719651341438293, + "learning_rate": 1.370747881082548e-06, + "loss": 0.5791, + "step": 10583 + }, + { + "epoch": 3.9070883583558946, + "grad_norm": 0.9076600074768066, + "learning_rate": 1.3703139736384797e-06, + "loss": 0.5804, + "step": 10584 + }, + { + "epoch": 3.907457543686931, + "grad_norm": 0.9516551494598389, + "learning_rate": 1.3698801089526694e-06, + "loss": 0.6359, + "step": 10585 + }, + { + "epoch": 3.907826729017967, + "grad_norm": 0.9349496960639954, + "learning_rate": 1.3694462870415415e-06, + "loss": 0.5544, + "step": 10586 + }, + { + "epoch": 3.9081959143490033, + "grad_norm": 0.9567404985427856, + "learning_rate": 1.369012507921515e-06, + "loss": 0.6308, + "step": 10587 + }, + { + "epoch": 3.9085650996800396, + "grad_norm": 0.9463831782341003, + "learning_rate": 1.3685787716090083e-06, + "loss": 0.5812, + "step": 10588 + }, + { + "epoch": 3.9089342850110755, + "grad_norm": 0.9308695793151855, + "learning_rate": 1.3681450781204368e-06, + "loss": 0.5963, + "step": 10589 + }, + { + "epoch": 3.909303470342112, + "grad_norm": 0.9596331715583801, + "learning_rate": 1.3677114274722167e-06, + "loss": 0.6189, + "step": 10590 + }, + { + "epoch": 3.909672655673148, + "grad_norm": 0.9254229068756104, + "learning_rate": 1.3672778196807628e-06, + "loss": 0.5631, + "step": 10591 + }, + { + "epoch": 3.910041841004184, + "grad_norm": 0.9437053799629211, + "learning_rate": 1.3668442547624861e-06, + "loss": 0.5606, + "step": 10592 + }, + { + "epoch": 3.91041102633522, + "grad_norm": 0.9316393733024597, + "learning_rate": 1.3664107327337966e-06, + "loss": 0.5899, + "step": 10593 + }, + { + "epoch": 3.9107802116662564, + "grad_norm": 0.9368863701820374, + "learning_rate": 1.3659772536111022e-06, + "loss": 0.6308, + "step": 10594 + }, + { + "epoch": 3.911149396997293, + "grad_norm": 0.9532739520072937, + "learning_rate": 1.3655438174108115e-06, + "loss": 0.5917, + "step": 10595 + }, + { + "epoch": 3.9115185823283287, + "grad_norm": 0.9505391716957092, + "learning_rate": 1.3651104241493296e-06, + "loss": 0.6078, + "step": 10596 + }, + { + "epoch": 3.911887767659365, + "grad_norm": 0.9480579495429993, + "learning_rate": 1.364677073843059e-06, + "loss": 0.5948, + "step": 10597 + }, + { + "epoch": 3.9122569529904014, + "grad_norm": 0.9349278807640076, + "learning_rate": 1.3642437665084037e-06, + "loss": 0.6139, + "step": 10598 + }, + { + "epoch": 3.9126261383214374, + "grad_norm": 0.9213259220123291, + "learning_rate": 1.3638105021617625e-06, + "loss": 0.5846, + "step": 10599 + }, + { + "epoch": 3.9129953236524733, + "grad_norm": 0.9823776483535767, + "learning_rate": 1.363377280819536e-06, + "loss": 0.5837, + "step": 10600 + }, + { + "epoch": 3.9133645089835096, + "grad_norm": 0.9204944968223572, + "learning_rate": 1.362944102498121e-06, + "loss": 0.5503, + "step": 10601 + }, + { + "epoch": 3.913733694314546, + "grad_norm": 0.920445442199707, + "learning_rate": 1.362510967213912e-06, + "loss": 0.5662, + "step": 10602 + }, + { + "epoch": 3.914102879645582, + "grad_norm": 0.9478007555007935, + "learning_rate": 1.3620778749833045e-06, + "loss": 0.5718, + "step": 10603 + }, + { + "epoch": 3.9144720649766183, + "grad_norm": 0.9022857546806335, + "learning_rate": 1.3616448258226902e-06, + "loss": 0.58, + "step": 10604 + }, + { + "epoch": 3.9148412503076546, + "grad_norm": 0.9251195192337036, + "learning_rate": 1.3612118197484593e-06, + "loss": 0.5643, + "step": 10605 + }, + { + "epoch": 3.9152104356386905, + "grad_norm": 1.0694224834442139, + "learning_rate": 1.3607788567770025e-06, + "loss": 0.5831, + "step": 10606 + }, + { + "epoch": 3.915579620969727, + "grad_norm": 0.925706684589386, + "learning_rate": 1.3603459369247062e-06, + "loss": 0.5696, + "step": 10607 + }, + { + "epoch": 3.9159488063007633, + "grad_norm": 0.9578813314437866, + "learning_rate": 1.3599130602079557e-06, + "loss": 0.5959, + "step": 10608 + }, + { + "epoch": 3.916317991631799, + "grad_norm": 0.9074733257293701, + "learning_rate": 1.3594802266431372e-06, + "loss": 0.5586, + "step": 10609 + }, + { + "epoch": 3.916687176962835, + "grad_norm": 0.9243195056915283, + "learning_rate": 1.359047436246631e-06, + "loss": 0.5639, + "step": 10610 + }, + { + "epoch": 3.9170563622938714, + "grad_norm": 0.9299200773239136, + "learning_rate": 1.3586146890348203e-06, + "loss": 0.6056, + "step": 10611 + }, + { + "epoch": 3.917425547624908, + "grad_norm": 0.9679391384124756, + "learning_rate": 1.3581819850240834e-06, + "loss": 0.5916, + "step": 10612 + }, + { + "epoch": 3.9177947329559437, + "grad_norm": 0.906618058681488, + "learning_rate": 1.3577493242307971e-06, + "loss": 0.5443, + "step": 10613 + }, + { + "epoch": 3.91816391828698, + "grad_norm": 0.927794873714447, + "learning_rate": 1.3573167066713394e-06, + "loss": 0.5666, + "step": 10614 + }, + { + "epoch": 3.9185331036180164, + "grad_norm": 0.9564919471740723, + "learning_rate": 1.3568841323620834e-06, + "loss": 0.5997, + "step": 10615 + }, + { + "epoch": 3.9189022889490523, + "grad_norm": 0.9268122911453247, + "learning_rate": 1.3564516013194023e-06, + "loss": 0.5566, + "step": 10616 + }, + { + "epoch": 3.9192714742800887, + "grad_norm": 0.9367059469223022, + "learning_rate": 1.3560191135596662e-06, + "loss": 0.5626, + "step": 10617 + }, + { + "epoch": 3.9196406596111246, + "grad_norm": 0.9203839302062988, + "learning_rate": 1.3555866690992458e-06, + "loss": 0.5866, + "step": 10618 + }, + { + "epoch": 3.920009844942161, + "grad_norm": 0.9240842461585999, + "learning_rate": 1.35515426795451e-06, + "loss": 0.5903, + "step": 10619 + }, + { + "epoch": 3.920379030273197, + "grad_norm": 0.9213091731071472, + "learning_rate": 1.3547219101418232e-06, + "loss": 0.5773, + "step": 10620 + }, + { + "epoch": 3.9207482156042333, + "grad_norm": 0.9338705539703369, + "learning_rate": 1.354289595677551e-06, + "loss": 0.5917, + "step": 10621 + }, + { + "epoch": 3.9211174009352696, + "grad_norm": 0.9228342771530151, + "learning_rate": 1.3538573245780552e-06, + "loss": 0.5613, + "step": 10622 + }, + { + "epoch": 3.9214865862663055, + "grad_norm": 0.9542357325553894, + "learning_rate": 1.3534250968596984e-06, + "loss": 0.5838, + "step": 10623 + }, + { + "epoch": 3.921855771597342, + "grad_norm": 0.9405385255813599, + "learning_rate": 1.3529929125388401e-06, + "loss": 0.594, + "step": 10624 + }, + { + "epoch": 3.9222249569283782, + "grad_norm": 0.9824245572090149, + "learning_rate": 1.3525607716318373e-06, + "loss": 0.6075, + "step": 10625 + }, + { + "epoch": 3.922594142259414, + "grad_norm": 0.9336971640586853, + "learning_rate": 1.3521286741550479e-06, + "loss": 0.5954, + "step": 10626 + }, + { + "epoch": 3.9229633275904505, + "grad_norm": 0.9389987587928772, + "learning_rate": 1.3516966201248261e-06, + "loss": 0.577, + "step": 10627 + }, + { + "epoch": 3.9233325129214864, + "grad_norm": 0.9354464411735535, + "learning_rate": 1.3512646095575236e-06, + "loss": 0.5693, + "step": 10628 + }, + { + "epoch": 3.923701698252523, + "grad_norm": 0.9143065810203552, + "learning_rate": 1.350832642469494e-06, + "loss": 0.6114, + "step": 10629 + }, + { + "epoch": 3.9240708835835587, + "grad_norm": 0.9163089990615845, + "learning_rate": 1.3504007188770858e-06, + "loss": 0.6128, + "step": 10630 + }, + { + "epoch": 3.924440068914595, + "grad_norm": 0.9220145344734192, + "learning_rate": 1.3499688387966479e-06, + "loss": 0.597, + "step": 10631 + }, + { + "epoch": 3.9248092542456314, + "grad_norm": 0.9048140048980713, + "learning_rate": 1.3495370022445267e-06, + "loss": 0.5265, + "step": 10632 + }, + { + "epoch": 3.9251784395766673, + "grad_norm": 0.9552127718925476, + "learning_rate": 1.349105209237066e-06, + "loss": 0.6066, + "step": 10633 + }, + { + "epoch": 3.9255476249077037, + "grad_norm": 0.9417923092842102, + "learning_rate": 1.3486734597906107e-06, + "loss": 0.5796, + "step": 10634 + }, + { + "epoch": 3.92591681023874, + "grad_norm": 0.9194878935813904, + "learning_rate": 1.3482417539215016e-06, + "loss": 0.5849, + "step": 10635 + }, + { + "epoch": 3.926285995569776, + "grad_norm": 0.9099927544593811, + "learning_rate": 1.3478100916460787e-06, + "loss": 0.581, + "step": 10636 + }, + { + "epoch": 3.9266551809008123, + "grad_norm": 0.8976762294769287, + "learning_rate": 1.3473784729806793e-06, + "loss": 0.5662, + "step": 10637 + }, + { + "epoch": 3.9270243662318483, + "grad_norm": 0.9211485981941223, + "learning_rate": 1.3469468979416405e-06, + "loss": 0.6059, + "step": 10638 + }, + { + "epoch": 3.9273935515628846, + "grad_norm": 0.9516952633857727, + "learning_rate": 1.3465153665452993e-06, + "loss": 0.5685, + "step": 10639 + }, + { + "epoch": 3.9277627368939205, + "grad_norm": 0.930330216884613, + "learning_rate": 1.3460838788079871e-06, + "loss": 0.582, + "step": 10640 + }, + { + "epoch": 3.928131922224957, + "grad_norm": 0.932865560054779, + "learning_rate": 1.345652434746035e-06, + "loss": 0.5633, + "step": 10641 + }, + { + "epoch": 3.9285011075559932, + "grad_norm": 0.9254530072212219, + "learning_rate": 1.3452210343757749e-06, + "loss": 0.5696, + "step": 10642 + }, + { + "epoch": 3.928870292887029, + "grad_norm": 0.9256355166435242, + "learning_rate": 1.3447896777135342e-06, + "loss": 0.5889, + "step": 10643 + }, + { + "epoch": 3.9292394782180655, + "grad_norm": 0.9202930927276611, + "learning_rate": 1.3443583647756397e-06, + "loss": 0.5657, + "step": 10644 + }, + { + "epoch": 3.929608663549102, + "grad_norm": 0.9748367071151733, + "learning_rate": 1.343927095578415e-06, + "loss": 0.6025, + "step": 10645 + }, + { + "epoch": 3.929977848880138, + "grad_norm": 0.924910306930542, + "learning_rate": 1.3434958701381851e-06, + "loss": 0.5482, + "step": 10646 + }, + { + "epoch": 3.930347034211174, + "grad_norm": 0.8950003981590271, + "learning_rate": 1.3430646884712736e-06, + "loss": 0.5667, + "step": 10647 + }, + { + "epoch": 3.93071621954221, + "grad_norm": 0.9856835603713989, + "learning_rate": 1.3426335505939963e-06, + "loss": 0.6101, + "step": 10648 + }, + { + "epoch": 3.9310854048732464, + "grad_norm": 0.9377638101577759, + "learning_rate": 1.342202456522675e-06, + "loss": 0.587, + "step": 10649 + }, + { + "epoch": 3.9314545902042823, + "grad_norm": 0.944389820098877, + "learning_rate": 1.3417714062736245e-06, + "loss": 0.6043, + "step": 10650 + }, + { + "epoch": 3.9318237755353187, + "grad_norm": 0.9431983828544617, + "learning_rate": 1.3413403998631616e-06, + "loss": 0.6024, + "step": 10651 + }, + { + "epoch": 3.932192960866355, + "grad_norm": 0.9450110793113708, + "learning_rate": 1.3409094373075987e-06, + "loss": 0.5869, + "step": 10652 + }, + { + "epoch": 3.932562146197391, + "grad_norm": 0.9287741780281067, + "learning_rate": 1.3404785186232472e-06, + "loss": 0.5917, + "step": 10653 + }, + { + "epoch": 3.9329313315284273, + "grad_norm": 0.9234588742256165, + "learning_rate": 1.340047643826418e-06, + "loss": 0.5768, + "step": 10654 + }, + { + "epoch": 3.9333005168594637, + "grad_norm": 0.9345517158508301, + "learning_rate": 1.3396168129334202e-06, + "loss": 0.5933, + "step": 10655 + }, + { + "epoch": 3.9336697021904996, + "grad_norm": 0.9287716150283813, + "learning_rate": 1.3391860259605585e-06, + "loss": 0.5681, + "step": 10656 + }, + { + "epoch": 3.934038887521536, + "grad_norm": 0.9214622378349304, + "learning_rate": 1.3387552829241405e-06, + "loss": 0.5837, + "step": 10657 + }, + { + "epoch": 3.934408072852572, + "grad_norm": 0.9410375356674194, + "learning_rate": 1.3383245838404675e-06, + "loss": 0.6168, + "step": 10658 + }, + { + "epoch": 3.9347772581836082, + "grad_norm": 0.944462239742279, + "learning_rate": 1.3378939287258432e-06, + "loss": 0.5934, + "step": 10659 + }, + { + "epoch": 3.935146443514644, + "grad_norm": 0.9210761189460754, + "learning_rate": 1.3374633175965671e-06, + "loss": 0.5902, + "step": 10660 + }, + { + "epoch": 3.9355156288456805, + "grad_norm": 0.9268816709518433, + "learning_rate": 1.3370327504689366e-06, + "loss": 0.601, + "step": 10661 + }, + { + "epoch": 3.935884814176717, + "grad_norm": 0.9174844026565552, + "learning_rate": 1.33660222735925e-06, + "loss": 0.6176, + "step": 10662 + }, + { + "epoch": 3.936253999507753, + "grad_norm": 0.9453315138816833, + "learning_rate": 1.3361717482838024e-06, + "loss": 0.6008, + "step": 10663 + }, + { + "epoch": 3.936623184838789, + "grad_norm": 0.9283411502838135, + "learning_rate": 1.3357413132588865e-06, + "loss": 0.6211, + "step": 10664 + }, + { + "epoch": 3.9369923701698255, + "grad_norm": 0.9539561867713928, + "learning_rate": 1.3353109223007936e-06, + "loss": 0.5865, + "step": 10665 + }, + { + "epoch": 3.9373615555008614, + "grad_norm": 0.9328828454017639, + "learning_rate": 1.3348805754258148e-06, + "loss": 0.6021, + "step": 10666 + }, + { + "epoch": 3.9377307408318973, + "grad_norm": 0.9738325476646423, + "learning_rate": 1.3344502726502406e-06, + "loss": 0.5899, + "step": 10667 + }, + { + "epoch": 3.9380999261629337, + "grad_norm": 0.8936073780059814, + "learning_rate": 1.3340200139903534e-06, + "loss": 0.5698, + "step": 10668 + }, + { + "epoch": 3.93846911149397, + "grad_norm": 0.9545515775680542, + "learning_rate": 1.3335897994624419e-06, + "loss": 0.5954, + "step": 10669 + }, + { + "epoch": 3.938838296825006, + "grad_norm": 0.9483173489570618, + "learning_rate": 1.3331596290827873e-06, + "loss": 0.6102, + "step": 10670 + }, + { + "epoch": 3.9392074821560423, + "grad_norm": 0.9314919114112854, + "learning_rate": 1.3327295028676735e-06, + "loss": 0.6028, + "step": 10671 + }, + { + "epoch": 3.9395766674870787, + "grad_norm": 0.9326043725013733, + "learning_rate": 1.3322994208333796e-06, + "loss": 0.5924, + "step": 10672 + }, + { + "epoch": 3.9399458528181146, + "grad_norm": 0.9187272787094116, + "learning_rate": 1.3318693829961832e-06, + "loss": 0.6184, + "step": 10673 + }, + { + "epoch": 3.940315038149151, + "grad_norm": 0.9255833029747009, + "learning_rate": 1.3314393893723615e-06, + "loss": 0.5747, + "step": 10674 + }, + { + "epoch": 3.9406842234801873, + "grad_norm": 0.9335517287254333, + "learning_rate": 1.3310094399781925e-06, + "loss": 0.5777, + "step": 10675 + }, + { + "epoch": 3.9410534088112232, + "grad_norm": 0.9309194087982178, + "learning_rate": 1.3305795348299454e-06, + "loss": 0.5912, + "step": 10676 + }, + { + "epoch": 3.941422594142259, + "grad_norm": 0.9551677107810974, + "learning_rate": 1.3301496739438946e-06, + "loss": 0.5917, + "step": 10677 + }, + { + "epoch": 3.9417917794732955, + "grad_norm": 0.9582645297050476, + "learning_rate": 1.3297198573363084e-06, + "loss": 0.5734, + "step": 10678 + }, + { + "epoch": 3.942160964804332, + "grad_norm": 0.9070813655853271, + "learning_rate": 1.3292900850234574e-06, + "loss": 0.6026, + "step": 10679 + }, + { + "epoch": 3.942530150135368, + "grad_norm": 0.9120562076568604, + "learning_rate": 1.3288603570216074e-06, + "loss": 0.614, + "step": 10680 + }, + { + "epoch": 3.942899335466404, + "grad_norm": 0.9424120783805847, + "learning_rate": 1.328430673347022e-06, + "loss": 0.5642, + "step": 10681 + }, + { + "epoch": 3.9432685207974405, + "grad_norm": 0.903985321521759, + "learning_rate": 1.3280010340159665e-06, + "loss": 0.5454, + "step": 10682 + }, + { + "epoch": 3.9436377061284764, + "grad_norm": 0.9182960391044617, + "learning_rate": 1.3275714390447026e-06, + "loss": 0.5732, + "step": 10683 + }, + { + "epoch": 3.9440068914595128, + "grad_norm": 0.9327577948570251, + "learning_rate": 1.3271418884494885e-06, + "loss": 0.5908, + "step": 10684 + }, + { + "epoch": 3.9443760767905487, + "grad_norm": 0.8940008282661438, + "learning_rate": 1.3267123822465846e-06, + "loss": 0.5545, + "step": 10685 + }, + { + "epoch": 3.944745262121585, + "grad_norm": 0.9080817103385925, + "learning_rate": 1.326282920452246e-06, + "loss": 0.5549, + "step": 10686 + }, + { + "epoch": 3.945114447452621, + "grad_norm": 0.9006867408752441, + "learning_rate": 1.3258535030827302e-06, + "loss": 0.5514, + "step": 10687 + }, + { + "epoch": 3.9454836327836573, + "grad_norm": 0.9737211465835571, + "learning_rate": 1.325424130154287e-06, + "loss": 0.6168, + "step": 10688 + }, + { + "epoch": 3.9458528181146937, + "grad_norm": 0.9545830488204956, + "learning_rate": 1.3249948016831697e-06, + "loss": 0.5942, + "step": 10689 + }, + { + "epoch": 3.9462220034457296, + "grad_norm": 0.9450685977935791, + "learning_rate": 1.324565517685629e-06, + "loss": 0.5731, + "step": 10690 + }, + { + "epoch": 3.946591188776766, + "grad_norm": 0.9225343465805054, + "learning_rate": 1.3241362781779128e-06, + "loss": 0.5866, + "step": 10691 + }, + { + "epoch": 3.9469603741078023, + "grad_norm": 0.9156228303909302, + "learning_rate": 1.3237070831762672e-06, + "loss": 0.5811, + "step": 10692 + }, + { + "epoch": 3.9473295594388382, + "grad_norm": 0.9017273783683777, + "learning_rate": 1.3232779326969365e-06, + "loss": 0.5828, + "step": 10693 + }, + { + "epoch": 3.9476987447698746, + "grad_norm": 0.9146370887756348, + "learning_rate": 1.3228488267561645e-06, + "loss": 0.567, + "step": 10694 + }, + { + "epoch": 3.9480679301009105, + "grad_norm": 0.9282599687576294, + "learning_rate": 1.3224197653701947e-06, + "loss": 0.5816, + "step": 10695 + }, + { + "epoch": 3.948437115431947, + "grad_norm": 0.9358929991722107, + "learning_rate": 1.3219907485552635e-06, + "loss": 0.5978, + "step": 10696 + }, + { + "epoch": 3.948806300762983, + "grad_norm": 0.926510214805603, + "learning_rate": 1.3215617763276112e-06, + "loss": 0.5817, + "step": 10697 + }, + { + "epoch": 3.949175486094019, + "grad_norm": 0.9527725577354431, + "learning_rate": 1.321132848703473e-06, + "loss": 0.6357, + "step": 10698 + }, + { + "epoch": 3.9495446714250555, + "grad_norm": 0.9398474097251892, + "learning_rate": 1.3207039656990855e-06, + "loss": 0.5738, + "step": 10699 + }, + { + "epoch": 3.9499138567560914, + "grad_norm": 0.9367092847824097, + "learning_rate": 1.3202751273306808e-06, + "loss": 0.6065, + "step": 10700 + }, + { + "epoch": 3.9502830420871278, + "grad_norm": 0.9157485961914062, + "learning_rate": 1.319846333614489e-06, + "loss": 0.5474, + "step": 10701 + }, + { + "epoch": 3.950652227418164, + "grad_norm": 0.9203722476959229, + "learning_rate": 1.3194175845667422e-06, + "loss": 0.5812, + "step": 10702 + }, + { + "epoch": 3.9510214127492, + "grad_norm": 0.9441129565238953, + "learning_rate": 1.318988880203667e-06, + "loss": 0.6135, + "step": 10703 + }, + { + "epoch": 3.9513905980802364, + "grad_norm": 0.9503044486045837, + "learning_rate": 1.3185602205414894e-06, + "loss": 0.5925, + "step": 10704 + }, + { + "epoch": 3.9517597834112723, + "grad_norm": 0.9278333187103271, + "learning_rate": 1.318131605596435e-06, + "loss": 0.5861, + "step": 10705 + }, + { + "epoch": 3.9521289687423087, + "grad_norm": 0.9261239171028137, + "learning_rate": 1.3177030353847255e-06, + "loss": 0.6013, + "step": 10706 + }, + { + "epoch": 3.9524981540733446, + "grad_norm": 0.9114288091659546, + "learning_rate": 1.3172745099225854e-06, + "loss": 0.5416, + "step": 10707 + }, + { + "epoch": 3.952867339404381, + "grad_norm": 0.9475899934768677, + "learning_rate": 1.3168460292262292e-06, + "loss": 0.5953, + "step": 10708 + }, + { + "epoch": 3.9532365247354173, + "grad_norm": 0.9270053505897522, + "learning_rate": 1.3164175933118777e-06, + "loss": 0.6067, + "step": 10709 + }, + { + "epoch": 3.9536057100664532, + "grad_norm": 0.9502788186073303, + "learning_rate": 1.3159892021957476e-06, + "loss": 0.6032, + "step": 10710 + }, + { + "epoch": 3.9539748953974896, + "grad_norm": 0.923638105392456, + "learning_rate": 1.3155608558940527e-06, + "loss": 0.5897, + "step": 10711 + }, + { + "epoch": 3.954344080728526, + "grad_norm": 0.9354206323623657, + "learning_rate": 1.3151325544230047e-06, + "loss": 0.6022, + "step": 10712 + }, + { + "epoch": 3.954713266059562, + "grad_norm": 0.942018985748291, + "learning_rate": 1.3147042977988161e-06, + "loss": 0.6048, + "step": 10713 + }, + { + "epoch": 3.955082451390598, + "grad_norm": 0.9153985381126404, + "learning_rate": 1.3142760860376952e-06, + "loss": 0.5676, + "step": 10714 + }, + { + "epoch": 3.955451636721634, + "grad_norm": 0.9192482829093933, + "learning_rate": 1.313847919155852e-06, + "loss": 0.5805, + "step": 10715 + }, + { + "epoch": 3.9558208220526705, + "grad_norm": 0.9331598877906799, + "learning_rate": 1.3134197971694889e-06, + "loss": 0.5863, + "step": 10716 + }, + { + "epoch": 3.9561900073837064, + "grad_norm": 0.9499639868736267, + "learning_rate": 1.3129917200948118e-06, + "loss": 0.6095, + "step": 10717 + }, + { + "epoch": 3.9565591927147428, + "grad_norm": 0.938499927520752, + "learning_rate": 1.3125636879480245e-06, + "loss": 0.57, + "step": 10718 + }, + { + "epoch": 3.956928378045779, + "grad_norm": 0.9417146444320679, + "learning_rate": 1.3121357007453268e-06, + "loss": 0.6088, + "step": 10719 + }, + { + "epoch": 3.957297563376815, + "grad_norm": 0.9072311520576477, + "learning_rate": 1.311707758502918e-06, + "loss": 0.5705, + "step": 10720 + }, + { + "epoch": 3.9576667487078514, + "grad_norm": 0.8882033824920654, + "learning_rate": 1.3112798612369945e-06, + "loss": 0.535, + "step": 10721 + }, + { + "epoch": 3.9580359340388878, + "grad_norm": 0.942417323589325, + "learning_rate": 1.3108520089637532e-06, + "loss": 0.5788, + "step": 10722 + }, + { + "epoch": 3.9584051193699237, + "grad_norm": 0.9583037495613098, + "learning_rate": 1.3104242016993898e-06, + "loss": 0.5705, + "step": 10723 + }, + { + "epoch": 3.95877430470096, + "grad_norm": 0.9197502136230469, + "learning_rate": 1.3099964394600931e-06, + "loss": 0.5978, + "step": 10724 + }, + { + "epoch": 3.959143490031996, + "grad_norm": 0.9429165720939636, + "learning_rate": 1.3095687222620567e-06, + "loss": 0.6078, + "step": 10725 + }, + { + "epoch": 3.9595126753630323, + "grad_norm": 0.9395483136177063, + "learning_rate": 1.3091410501214674e-06, + "loss": 0.5968, + "step": 10726 + }, + { + "epoch": 3.9598818606940682, + "grad_norm": 0.9119990468025208, + "learning_rate": 1.3087134230545144e-06, + "loss": 0.5692, + "step": 10727 + }, + { + "epoch": 3.9602510460251046, + "grad_norm": 0.9663122296333313, + "learning_rate": 1.3082858410773824e-06, + "loss": 0.6239, + "step": 10728 + }, + { + "epoch": 3.960620231356141, + "grad_norm": 0.9532100558280945, + "learning_rate": 1.3078583042062542e-06, + "loss": 0.6032, + "step": 10729 + }, + { + "epoch": 3.960989416687177, + "grad_norm": 0.925879716873169, + "learning_rate": 1.3074308124573137e-06, + "loss": 0.5816, + "step": 10730 + }, + { + "epoch": 3.961358602018213, + "grad_norm": 0.9500490427017212, + "learning_rate": 1.307003365846741e-06, + "loss": 0.6147, + "step": 10731 + }, + { + "epoch": 3.9617277873492496, + "grad_norm": 0.9472479224205017, + "learning_rate": 1.3065759643907134e-06, + "loss": 0.6102, + "step": 10732 + }, + { + "epoch": 3.9620969726802855, + "grad_norm": 0.947428286075592, + "learning_rate": 1.3061486081054095e-06, + "loss": 0.6186, + "step": 10733 + }, + { + "epoch": 3.962466158011322, + "grad_norm": 0.9279646277427673, + "learning_rate": 1.3057212970070032e-06, + "loss": 0.6095, + "step": 10734 + }, + { + "epoch": 3.9628353433423578, + "grad_norm": 0.9334374666213989, + "learning_rate": 1.3052940311116708e-06, + "loss": 0.5789, + "step": 10735 + }, + { + "epoch": 3.963204528673394, + "grad_norm": 0.9112969636917114, + "learning_rate": 1.3048668104355804e-06, + "loss": 0.5607, + "step": 10736 + }, + { + "epoch": 3.96357371400443, + "grad_norm": 0.9359127879142761, + "learning_rate": 1.304439634994904e-06, + "loss": 0.5998, + "step": 10737 + }, + { + "epoch": 3.9639428993354664, + "grad_norm": 0.9186880588531494, + "learning_rate": 1.304012504805811e-06, + "loss": 0.6105, + "step": 10738 + }, + { + "epoch": 3.9643120846665028, + "grad_norm": 0.9200249910354614, + "learning_rate": 1.3035854198844672e-06, + "loss": 0.5853, + "step": 10739 + }, + { + "epoch": 3.9646812699975387, + "grad_norm": 0.8918241262435913, + "learning_rate": 1.3031583802470379e-06, + "loss": 0.5264, + "step": 10740 + }, + { + "epoch": 3.965050455328575, + "grad_norm": 0.9325987100601196, + "learning_rate": 1.302731385909685e-06, + "loss": 0.6181, + "step": 10741 + }, + { + "epoch": 3.9654196406596114, + "grad_norm": 0.9330899715423584, + "learning_rate": 1.3023044368885714e-06, + "loss": 0.581, + "step": 10742 + }, + { + "epoch": 3.9657888259906473, + "grad_norm": 0.9515547752380371, + "learning_rate": 1.301877533199859e-06, + "loss": 0.6136, + "step": 10743 + }, + { + "epoch": 3.966158011321683, + "grad_norm": 0.9380596280097961, + "learning_rate": 1.3014506748597018e-06, + "loss": 0.6016, + "step": 10744 + }, + { + "epoch": 3.9665271966527196, + "grad_norm": 0.9153798222541809, + "learning_rate": 1.3010238618842587e-06, + "loss": 0.5736, + "step": 10745 + }, + { + "epoch": 3.966896381983756, + "grad_norm": 0.9417311549186707, + "learning_rate": 1.3005970942896833e-06, + "loss": 0.645, + "step": 10746 + }, + { + "epoch": 3.967265567314792, + "grad_norm": 0.8958253860473633, + "learning_rate": 1.3001703720921307e-06, + "loss": 0.5332, + "step": 10747 + }, + { + "epoch": 3.967634752645828, + "grad_norm": 0.9217292666435242, + "learning_rate": 1.2997436953077508e-06, + "loss": 0.5801, + "step": 10748 + }, + { + "epoch": 3.9680039379768646, + "grad_norm": 0.9417540431022644, + "learning_rate": 1.299317063952692e-06, + "loss": 0.62, + "step": 10749 + }, + { + "epoch": 3.9683731233079005, + "grad_norm": 0.9450987577438354, + "learning_rate": 1.2988904780431033e-06, + "loss": 0.6008, + "step": 10750 + }, + { + "epoch": 3.968742308638937, + "grad_norm": 0.929845929145813, + "learning_rate": 1.298463937595133e-06, + "loss": 0.6247, + "step": 10751 + }, + { + "epoch": 3.969111493969973, + "grad_norm": 0.9167857766151428, + "learning_rate": 1.2980374426249218e-06, + "loss": 0.6167, + "step": 10752 + }, + { + "epoch": 3.969480679301009, + "grad_norm": 0.9457307457923889, + "learning_rate": 1.2976109931486147e-06, + "loss": 0.5745, + "step": 10753 + }, + { + "epoch": 3.969849864632045, + "grad_norm": 0.94721919298172, + "learning_rate": 1.297184589182351e-06, + "loss": 0.5895, + "step": 10754 + }, + { + "epoch": 3.9702190499630814, + "grad_norm": 0.9396242499351501, + "learning_rate": 1.2967582307422722e-06, + "loss": 0.5772, + "step": 10755 + }, + { + "epoch": 3.9705882352941178, + "grad_norm": 0.9128915071487427, + "learning_rate": 1.2963319178445143e-06, + "loss": 0.5772, + "step": 10756 + }, + { + "epoch": 3.9709574206251537, + "grad_norm": 0.8870357871055603, + "learning_rate": 1.295905650505213e-06, + "loss": 0.5881, + "step": 10757 + }, + { + "epoch": 3.97132660595619, + "grad_norm": 0.9239461421966553, + "learning_rate": 1.2954794287405031e-06, + "loss": 0.5778, + "step": 10758 + }, + { + "epoch": 3.9716957912872264, + "grad_norm": 0.9430659413337708, + "learning_rate": 1.2950532525665172e-06, + "loss": 0.5956, + "step": 10759 + }, + { + "epoch": 3.9720649766182623, + "grad_norm": 0.9525439143180847, + "learning_rate": 1.2946271219993844e-06, + "loss": 0.5524, + "step": 10760 + }, + { + "epoch": 3.9724341619492987, + "grad_norm": 0.946251392364502, + "learning_rate": 1.2942010370552355e-06, + "loss": 0.5914, + "step": 10761 + }, + { + "epoch": 3.9728033472803346, + "grad_norm": 0.9255157113075256, + "learning_rate": 1.2937749977501957e-06, + "loss": 0.583, + "step": 10762 + }, + { + "epoch": 3.973172532611371, + "grad_norm": 1.0022145509719849, + "learning_rate": 1.2933490041003932e-06, + "loss": 0.6126, + "step": 10763 + }, + { + "epoch": 3.973541717942407, + "grad_norm": 0.9211226105690002, + "learning_rate": 1.2929230561219485e-06, + "loss": 0.5898, + "step": 10764 + }, + { + "epoch": 3.973910903273443, + "grad_norm": 0.9274464845657349, + "learning_rate": 1.2924971538309855e-06, + "loss": 0.5982, + "step": 10765 + }, + { + "epoch": 3.9742800886044796, + "grad_norm": 0.9239576458930969, + "learning_rate": 1.2920712972436245e-06, + "loss": 0.5801, + "step": 10766 + }, + { + "epoch": 3.9746492739355155, + "grad_norm": 0.9252393245697021, + "learning_rate": 1.2916454863759838e-06, + "loss": 0.5784, + "step": 10767 + }, + { + "epoch": 3.975018459266552, + "grad_norm": 0.9363675117492676, + "learning_rate": 1.2912197212441802e-06, + "loss": 0.5765, + "step": 10768 + }, + { + "epoch": 3.975387644597588, + "grad_norm": 0.9069554805755615, + "learning_rate": 1.2907940018643278e-06, + "loss": 0.5492, + "step": 10769 + }, + { + "epoch": 3.975756829928624, + "grad_norm": 0.9472962617874146, + "learning_rate": 1.2903683282525406e-06, + "loss": 0.5763, + "step": 10770 + }, + { + "epoch": 3.9761260152596605, + "grad_norm": 0.9617408514022827, + "learning_rate": 1.2899427004249327e-06, + "loss": 0.6303, + "step": 10771 + }, + { + "epoch": 3.9764952005906964, + "grad_norm": 0.9285854697227478, + "learning_rate": 1.289517118397609e-06, + "loss": 0.5925, + "step": 10772 + }, + { + "epoch": 3.9768643859217327, + "grad_norm": 0.9099348187446594, + "learning_rate": 1.289091582186682e-06, + "loss": 0.5747, + "step": 10773 + }, + { + "epoch": 3.9772335712527687, + "grad_norm": 0.9417357444763184, + "learning_rate": 1.2886660918082553e-06, + "loss": 0.5901, + "step": 10774 + }, + { + "epoch": 3.977602756583805, + "grad_norm": 0.9367745518684387, + "learning_rate": 1.2882406472784356e-06, + "loss": 0.5672, + "step": 10775 + }, + { + "epoch": 3.9779719419148414, + "grad_norm": 0.915507972240448, + "learning_rate": 1.287815248613325e-06, + "loss": 0.6018, + "step": 10776 + }, + { + "epoch": 3.9783411272458773, + "grad_norm": 0.949933648109436, + "learning_rate": 1.2873898958290238e-06, + "loss": 0.608, + "step": 10777 + }, + { + "epoch": 3.9787103125769137, + "grad_norm": 0.9507619738578796, + "learning_rate": 1.2869645889416332e-06, + "loss": 0.5987, + "step": 10778 + }, + { + "epoch": 3.97907949790795, + "grad_norm": 0.9561052322387695, + "learning_rate": 1.28653932796725e-06, + "loss": 0.6102, + "step": 10779 + }, + { + "epoch": 3.979448683238986, + "grad_norm": 0.9243606328964233, + "learning_rate": 1.2861141129219695e-06, + "loss": 0.5647, + "step": 10780 + }, + { + "epoch": 3.9798178685700223, + "grad_norm": 0.912416934967041, + "learning_rate": 1.2856889438218873e-06, + "loss": 0.5643, + "step": 10781 + }, + { + "epoch": 3.980187053901058, + "grad_norm": 0.942054033279419, + "learning_rate": 1.2852638206830946e-06, + "loss": 0.5906, + "step": 10782 + }, + { + "epoch": 3.9805562392320946, + "grad_norm": 0.9137569069862366, + "learning_rate": 1.284838743521685e-06, + "loss": 0.5466, + "step": 10783 + }, + { + "epoch": 3.9809254245631305, + "grad_norm": 0.9795134663581848, + "learning_rate": 1.2844137123537433e-06, + "loss": 0.6052, + "step": 10784 + }, + { + "epoch": 3.981294609894167, + "grad_norm": 0.9261170625686646, + "learning_rate": 1.283988727195359e-06, + "loss": 0.5863, + "step": 10785 + }, + { + "epoch": 3.981663795225203, + "grad_norm": 0.9499906301498413, + "learning_rate": 1.2835637880626186e-06, + "loss": 0.5999, + "step": 10786 + }, + { + "epoch": 3.982032980556239, + "grad_norm": 0.9309170246124268, + "learning_rate": 1.2831388949716051e-06, + "loss": 0.5967, + "step": 10787 + }, + { + "epoch": 3.9824021658872755, + "grad_norm": 0.9404285550117493, + "learning_rate": 1.2827140479383993e-06, + "loss": 0.5775, + "step": 10788 + }, + { + "epoch": 3.982771351218312, + "grad_norm": 0.9485201835632324, + "learning_rate": 1.2822892469790832e-06, + "loss": 0.5761, + "step": 10789 + }, + { + "epoch": 3.9831405365493477, + "grad_norm": 0.9331871271133423, + "learning_rate": 1.2818644921097343e-06, + "loss": 0.5898, + "step": 10790 + }, + { + "epoch": 3.983509721880384, + "grad_norm": 0.9198427796363831, + "learning_rate": 1.2814397833464315e-06, + "loss": 0.5796, + "step": 10791 + }, + { + "epoch": 3.98387890721142, + "grad_norm": 0.9182186126708984, + "learning_rate": 1.2810151207052465e-06, + "loss": 0.5664, + "step": 10792 + }, + { + "epoch": 3.9842480925424564, + "grad_norm": 0.9433943033218384, + "learning_rate": 1.2805905042022545e-06, + "loss": 0.6288, + "step": 10793 + }, + { + "epoch": 3.9846172778734923, + "grad_norm": 0.9031492471694946, + "learning_rate": 1.2801659338535282e-06, + "loss": 0.5725, + "step": 10794 + }, + { + "epoch": 3.9849864632045287, + "grad_norm": 0.9265066385269165, + "learning_rate": 1.2797414096751358e-06, + "loss": 0.559, + "step": 10795 + }, + { + "epoch": 3.985355648535565, + "grad_norm": 0.9166187644004822, + "learning_rate": 1.2793169316831466e-06, + "loss": 0.5482, + "step": 10796 + }, + { + "epoch": 3.985724833866601, + "grad_norm": 0.9648780226707458, + "learning_rate": 1.2788924998936248e-06, + "loss": 0.6249, + "step": 10797 + }, + { + "epoch": 3.9860940191976373, + "grad_norm": 0.9251810312271118, + "learning_rate": 1.2784681143226379e-06, + "loss": 0.5631, + "step": 10798 + }, + { + "epoch": 3.9864632045286736, + "grad_norm": 0.9180653095245361, + "learning_rate": 1.2780437749862471e-06, + "loss": 0.5713, + "step": 10799 + }, + { + "epoch": 3.9868323898597096, + "grad_norm": 0.9388067722320557, + "learning_rate": 1.2776194819005128e-06, + "loss": 0.5759, + "step": 10800 + }, + { + "epoch": 3.987201575190746, + "grad_norm": 0.9144759178161621, + "learning_rate": 1.2771952350814962e-06, + "loss": 0.5845, + "step": 10801 + }, + { + "epoch": 3.987570760521782, + "grad_norm": 0.9221771955490112, + "learning_rate": 1.2767710345452533e-06, + "loss": 0.5925, + "step": 10802 + }, + { + "epoch": 3.987939945852818, + "grad_norm": 0.9495056867599487, + "learning_rate": 1.2763468803078414e-06, + "loss": 0.6047, + "step": 10803 + }, + { + "epoch": 3.988309131183854, + "grad_norm": 0.9302650094032288, + "learning_rate": 1.275922772385314e-06, + "loss": 0.5675, + "step": 10804 + }, + { + "epoch": 3.9886783165148905, + "grad_norm": 0.9310543537139893, + "learning_rate": 1.2754987107937223e-06, + "loss": 0.5584, + "step": 10805 + }, + { + "epoch": 3.989047501845927, + "grad_norm": 0.9011749029159546, + "learning_rate": 1.2750746955491188e-06, + "loss": 0.573, + "step": 10806 + }, + { + "epoch": 3.9894166871769627, + "grad_norm": 0.931578516960144, + "learning_rate": 1.2746507266675518e-06, + "loss": 0.6197, + "step": 10807 + }, + { + "epoch": 3.989785872507999, + "grad_norm": 0.9468767642974854, + "learning_rate": 1.2742268041650669e-06, + "loss": 0.5831, + "step": 10808 + }, + { + "epoch": 3.9901550578390355, + "grad_norm": 0.9077659845352173, + "learning_rate": 1.2738029280577115e-06, + "loss": 0.5905, + "step": 10809 + }, + { + "epoch": 3.9905242431700714, + "grad_norm": 0.9293115139007568, + "learning_rate": 1.2733790983615274e-06, + "loss": 0.6032, + "step": 10810 + }, + { + "epoch": 3.9908934285011073, + "grad_norm": 0.9243516325950623, + "learning_rate": 1.2729553150925595e-06, + "loss": 0.6137, + "step": 10811 + }, + { + "epoch": 3.9912626138321436, + "grad_norm": 0.9205523729324341, + "learning_rate": 1.2725315782668434e-06, + "loss": 0.5763, + "step": 10812 + }, + { + "epoch": 3.99163179916318, + "grad_norm": 0.9119263887405396, + "learning_rate": 1.2721078879004197e-06, + "loss": 0.5517, + "step": 10813 + }, + { + "epoch": 3.992000984494216, + "grad_norm": 0.9388280510902405, + "learning_rate": 1.271684244009326e-06, + "loss": 0.6049, + "step": 10814 + }, + { + "epoch": 3.9923701698252523, + "grad_norm": 0.9176425337791443, + "learning_rate": 1.271260646609596e-06, + "loss": 0.6083, + "step": 10815 + }, + { + "epoch": 3.9927393551562886, + "grad_norm": 0.9429195523262024, + "learning_rate": 1.2708370957172628e-06, + "loss": 0.5961, + "step": 10816 + }, + { + "epoch": 3.9931085404873246, + "grad_norm": 0.9816126227378845, + "learning_rate": 1.2704135913483563e-06, + "loss": 0.604, + "step": 10817 + }, + { + "epoch": 3.993477725818361, + "grad_norm": 0.9204774498939514, + "learning_rate": 1.2699901335189085e-06, + "loss": 0.5859, + "step": 10818 + }, + { + "epoch": 3.9938469111493973, + "grad_norm": 0.9334805607795715, + "learning_rate": 1.269566722244946e-06, + "loss": 0.5577, + "step": 10819 + }, + { + "epoch": 3.994216096480433, + "grad_norm": 0.9273476004600525, + "learning_rate": 1.2691433575424938e-06, + "loss": 0.5578, + "step": 10820 + }, + { + "epoch": 3.994585281811469, + "grad_norm": 0.9387087821960449, + "learning_rate": 1.2687200394275778e-06, + "loss": 0.5693, + "step": 10821 + }, + { + "epoch": 3.9949544671425055, + "grad_norm": 0.9456417560577393, + "learning_rate": 1.2682967679162188e-06, + "loss": 0.5957, + "step": 10822 + }, + { + "epoch": 3.995323652473542, + "grad_norm": 0.9555090069770813, + "learning_rate": 1.2678735430244393e-06, + "loss": 0.5945, + "step": 10823 + }, + { + "epoch": 3.9956928378045777, + "grad_norm": 0.9324617385864258, + "learning_rate": 1.2674503647682577e-06, + "loss": 0.5575, + "step": 10824 + }, + { + "epoch": 3.996062023135614, + "grad_norm": 0.95463627576828, + "learning_rate": 1.2670272331636897e-06, + "loss": 0.6125, + "step": 10825 + }, + { + "epoch": 3.9964312084666505, + "grad_norm": 0.948390543460846, + "learning_rate": 1.2666041482267525e-06, + "loss": 0.6039, + "step": 10826 + }, + { + "epoch": 3.9968003937976864, + "grad_norm": 0.925609827041626, + "learning_rate": 1.2661811099734592e-06, + "loss": 0.587, + "step": 10827 + }, + { + "epoch": 3.9971695791287227, + "grad_norm": 0.9204981327056885, + "learning_rate": 1.2657581184198204e-06, + "loss": 0.5593, + "step": 10828 + }, + { + "epoch": 3.9975387644597586, + "grad_norm": 0.9595109224319458, + "learning_rate": 1.2653351735818483e-06, + "loss": 0.5771, + "step": 10829 + }, + { + "epoch": 3.997907949790795, + "grad_norm": 0.9377339482307434, + "learning_rate": 1.2649122754755494e-06, + "loss": 0.5572, + "step": 10830 + }, + { + "epoch": 3.998277135121831, + "grad_norm": 0.92191082239151, + "learning_rate": 1.264489424116932e-06, + "loss": 0.5693, + "step": 10831 + }, + { + "epoch": 3.9986463204528673, + "grad_norm": 0.9193907976150513, + "learning_rate": 1.2640666195219998e-06, + "loss": 0.5668, + "step": 10832 + }, + { + "epoch": 3.9990155057839036, + "grad_norm": 0.916500985622406, + "learning_rate": 1.263643861706755e-06, + "loss": 0.5687, + "step": 10833 + }, + { + "epoch": 3.9993846911149395, + "grad_norm": 0.9654544591903687, + "learning_rate": 1.2632211506872011e-06, + "loss": 0.5818, + "step": 10834 + }, + { + "epoch": 3.999753876445976, + "grad_norm": 0.9468583464622498, + "learning_rate": 1.262798486479336e-06, + "loss": 0.5685, + "step": 10835 + }, + { + "epoch": 4.0, + "grad_norm": 0.9468583464622498, + "learning_rate": 1.2623758690991569e-06, + "loss": 0.5648, + "step": 10836 + }, + { + "epoch": 4.000369185331036, + "grad_norm": 1.278399109840393, + "learning_rate": 1.2619532985626615e-06, + "loss": 0.5544, + "step": 10837 + }, + { + "epoch": 4.000738370662073, + "grad_norm": 0.9362720251083374, + "learning_rate": 1.2615307748858429e-06, + "loss": 0.5815, + "step": 10838 + }, + { + "epoch": 4.001107555993109, + "grad_norm": 0.9367736577987671, + "learning_rate": 1.2611082980846939e-06, + "loss": 0.5802, + "step": 10839 + }, + { + "epoch": 4.0014767413241445, + "grad_norm": 0.9402756094932556, + "learning_rate": 1.2606858681752037e-06, + "loss": 0.6149, + "step": 10840 + }, + { + "epoch": 4.0018459266551805, + "grad_norm": 0.9351396560668945, + "learning_rate": 1.260263485173362e-06, + "loss": 0.6082, + "step": 10841 + }, + { + "epoch": 4.002215111986217, + "grad_norm": 0.8978464603424072, + "learning_rate": 1.2598411490951572e-06, + "loss": 0.6024, + "step": 10842 + }, + { + "epoch": 4.002584297317253, + "grad_norm": 0.9330450296401978, + "learning_rate": 1.2594188599565737e-06, + "loss": 0.5731, + "step": 10843 + }, + { + "epoch": 4.002953482648289, + "grad_norm": 0.9200763702392578, + "learning_rate": 1.2589966177735948e-06, + "loss": 0.5413, + "step": 10844 + }, + { + "epoch": 4.003322667979326, + "grad_norm": 0.9131563305854797, + "learning_rate": 1.2585744225622013e-06, + "loss": 0.5817, + "step": 10845 + }, + { + "epoch": 4.003691853310362, + "grad_norm": 0.9399466514587402, + "learning_rate": 1.258152274338375e-06, + "loss": 0.5079, + "step": 10846 + }, + { + "epoch": 4.004061038641398, + "grad_norm": 0.9336439371109009, + "learning_rate": 1.257730173118093e-06, + "loss": 0.6196, + "step": 10847 + }, + { + "epoch": 4.0044302239724345, + "grad_norm": 0.9303733110427856, + "learning_rate": 1.2573081189173314e-06, + "loss": 0.5538, + "step": 10848 + }, + { + "epoch": 4.0047994093034704, + "grad_norm": 0.939836859703064, + "learning_rate": 1.2568861117520658e-06, + "loss": 0.5698, + "step": 10849 + }, + { + "epoch": 4.005168594634506, + "grad_norm": 0.9361497163772583, + "learning_rate": 1.2564641516382677e-06, + "loss": 0.5518, + "step": 10850 + }, + { + "epoch": 4.005537779965542, + "grad_norm": 0.9537571668624878, + "learning_rate": 1.2560422385919103e-06, + "loss": 0.5692, + "step": 10851 + }, + { + "epoch": 4.005906965296579, + "grad_norm": 0.9417486190795898, + "learning_rate": 1.2556203726289608e-06, + "loss": 0.559, + "step": 10852 + }, + { + "epoch": 4.006276150627615, + "grad_norm": 0.9365049004554749, + "learning_rate": 1.255198553765387e-06, + "loss": 0.548, + "step": 10853 + }, + { + "epoch": 4.006645335958651, + "grad_norm": 0.9349099397659302, + "learning_rate": 1.2547767820171561e-06, + "loss": 0.5753, + "step": 10854 + }, + { + "epoch": 4.007014521289688, + "grad_norm": 0.9277873039245605, + "learning_rate": 1.254355057400231e-06, + "loss": 0.6089, + "step": 10855 + }, + { + "epoch": 4.007383706620724, + "grad_norm": 0.9331421852111816, + "learning_rate": 1.2539333799305725e-06, + "loss": 0.5946, + "step": 10856 + }, + { + "epoch": 4.0077528919517595, + "grad_norm": 0.9349789023399353, + "learning_rate": 1.253511749624144e-06, + "loss": 0.583, + "step": 10857 + }, + { + "epoch": 4.008122077282796, + "grad_norm": 0.9848392009735107, + "learning_rate": 1.2530901664969017e-06, + "loss": 0.5975, + "step": 10858 + }, + { + "epoch": 4.008491262613832, + "grad_norm": 0.950854480266571, + "learning_rate": 1.2526686305648033e-06, + "loss": 0.5974, + "step": 10859 + }, + { + "epoch": 4.008860447944868, + "grad_norm": 0.9344750642776489, + "learning_rate": 1.2522471418438026e-06, + "loss": 0.5502, + "step": 10860 + }, + { + "epoch": 4.009229633275904, + "grad_norm": 0.947098970413208, + "learning_rate": 1.2518257003498537e-06, + "loss": 0.5565, + "step": 10861 + }, + { + "epoch": 4.009598818606941, + "grad_norm": 0.9229680895805359, + "learning_rate": 1.2514043060989093e-06, + "loss": 0.5801, + "step": 10862 + }, + { + "epoch": 4.009968003937977, + "grad_norm": 0.9057174921035767, + "learning_rate": 1.2509829591069179e-06, + "loss": 0.553, + "step": 10863 + }, + { + "epoch": 4.010337189269013, + "grad_norm": 0.9440445303916931, + "learning_rate": 1.250561659389826e-06, + "loss": 0.5551, + "step": 10864 + }, + { + "epoch": 4.0107063746000495, + "grad_norm": 0.8987023234367371, + "learning_rate": 1.2501404069635819e-06, + "loss": 0.5941, + "step": 10865 + }, + { + "epoch": 4.011075559931085, + "grad_norm": 0.8900052905082703, + "learning_rate": 1.2497192018441293e-06, + "loss": 0.5843, + "step": 10866 + }, + { + "epoch": 4.011444745262121, + "grad_norm": 0.9331539869308472, + "learning_rate": 1.24929804404741e-06, + "loss": 0.5888, + "step": 10867 + }, + { + "epoch": 4.011813930593158, + "grad_norm": 0.9467595815658569, + "learning_rate": 1.2488769335893641e-06, + "loss": 0.5714, + "step": 10868 + }, + { + "epoch": 4.012183115924194, + "grad_norm": 0.9435511231422424, + "learning_rate": 1.2484558704859316e-06, + "loss": 0.5793, + "step": 10869 + }, + { + "epoch": 4.01255230125523, + "grad_norm": 0.9329623579978943, + "learning_rate": 1.2480348547530502e-06, + "loss": 0.5599, + "step": 10870 + }, + { + "epoch": 4.012921486586266, + "grad_norm": 0.9311836361885071, + "learning_rate": 1.2476138864066545e-06, + "loss": 0.5882, + "step": 10871 + }, + { + "epoch": 4.013290671917303, + "grad_norm": 0.9308541417121887, + "learning_rate": 1.247192965462678e-06, + "loss": 0.5937, + "step": 10872 + }, + { + "epoch": 4.013659857248339, + "grad_norm": 0.9807907342910767, + "learning_rate": 1.2467720919370513e-06, + "loss": 0.5754, + "step": 10873 + }, + { + "epoch": 4.0140290425793745, + "grad_norm": 0.926995575428009, + "learning_rate": 1.2463512658457065e-06, + "loss": 0.5414, + "step": 10874 + }, + { + "epoch": 4.014398227910411, + "grad_norm": 0.951440155506134, + "learning_rate": 1.2459304872045703e-06, + "loss": 0.544, + "step": 10875 + }, + { + "epoch": 4.014767413241447, + "grad_norm": 0.9332583546638489, + "learning_rate": 1.2455097560295687e-06, + "loss": 0.5718, + "step": 10876 + }, + { + "epoch": 4.015136598572483, + "grad_norm": 0.9149236083030701, + "learning_rate": 1.2450890723366277e-06, + "loss": 0.5909, + "step": 10877 + }, + { + "epoch": 4.01550578390352, + "grad_norm": 0.9414253830909729, + "learning_rate": 1.2446684361416694e-06, + "loss": 0.5576, + "step": 10878 + }, + { + "epoch": 4.015874969234556, + "grad_norm": 0.9545751810073853, + "learning_rate": 1.2442478474606135e-06, + "loss": 0.5762, + "step": 10879 + }, + { + "epoch": 4.016244154565592, + "grad_norm": 0.9198082685470581, + "learning_rate": 1.2438273063093811e-06, + "loss": 0.6008, + "step": 10880 + }, + { + "epoch": 4.016613339896628, + "grad_norm": 0.9693282246589661, + "learning_rate": 1.2434068127038882e-06, + "loss": 0.5882, + "step": 10881 + }, + { + "epoch": 4.0169825252276645, + "grad_norm": 0.9554240703582764, + "learning_rate": 1.2429863666600514e-06, + "loss": 0.5784, + "step": 10882 + }, + { + "epoch": 4.0173517105587, + "grad_norm": 0.9339196085929871, + "learning_rate": 1.242565968193784e-06, + "loss": 0.5694, + "step": 10883 + }, + { + "epoch": 4.017720895889736, + "grad_norm": 1.0189228057861328, + "learning_rate": 1.242145617320997e-06, + "loss": 0.6314, + "step": 10884 + }, + { + "epoch": 4.018090081220773, + "grad_norm": 0.9653273820877075, + "learning_rate": 1.2417253140576025e-06, + "loss": 0.553, + "step": 10885 + }, + { + "epoch": 4.018459266551809, + "grad_norm": 0.947452962398529, + "learning_rate": 1.2413050584195075e-06, + "loss": 0.5821, + "step": 10886 + }, + { + "epoch": 4.018828451882845, + "grad_norm": 0.933356761932373, + "learning_rate": 1.2408848504226188e-06, + "loss": 0.5734, + "step": 10887 + }, + { + "epoch": 4.019197637213882, + "grad_norm": 0.9411876797676086, + "learning_rate": 1.24046469008284e-06, + "loss": 0.6245, + "step": 10888 + }, + { + "epoch": 4.019566822544918, + "grad_norm": 0.9467840194702148, + "learning_rate": 1.2400445774160755e-06, + "loss": 0.5646, + "step": 10889 + }, + { + "epoch": 4.019936007875954, + "grad_norm": 0.9585430026054382, + "learning_rate": 1.239624512438227e-06, + "loss": 0.5797, + "step": 10890 + }, + { + "epoch": 4.0203051932069895, + "grad_norm": 0.9219393134117126, + "learning_rate": 1.2392044951651926e-06, + "loss": 0.5706, + "step": 10891 + }, + { + "epoch": 4.020674378538026, + "grad_norm": 0.9319815635681152, + "learning_rate": 1.2387845256128705e-06, + "loss": 0.5735, + "step": 10892 + }, + { + "epoch": 4.021043563869062, + "grad_norm": 0.9642653465270996, + "learning_rate": 1.238364603797155e-06, + "loss": 0.5885, + "step": 10893 + }, + { + "epoch": 4.021412749200098, + "grad_norm": 0.8793693780899048, + "learning_rate": 1.237944729733942e-06, + "loss": 0.5742, + "step": 10894 + }, + { + "epoch": 4.021781934531135, + "grad_norm": 0.9429925680160522, + "learning_rate": 1.2375249034391227e-06, + "loss": 0.5714, + "step": 10895 + }, + { + "epoch": 4.022151119862171, + "grad_norm": 0.9215102195739746, + "learning_rate": 1.2371051249285865e-06, + "loss": 0.5764, + "step": 10896 + }, + { + "epoch": 4.022520305193207, + "grad_norm": 0.945725679397583, + "learning_rate": 1.2366853942182225e-06, + "loss": 0.5902, + "step": 10897 + }, + { + "epoch": 4.022889490524244, + "grad_norm": 0.9442395567893982, + "learning_rate": 1.2362657113239195e-06, + "loss": 0.5857, + "step": 10898 + }, + { + "epoch": 4.0232586758552795, + "grad_norm": 0.9640471339225769, + "learning_rate": 1.235846076261559e-06, + "loss": 0.5972, + "step": 10899 + }, + { + "epoch": 4.023627861186315, + "grad_norm": 0.940978467464447, + "learning_rate": 1.2354264890470263e-06, + "loss": 0.5648, + "step": 10900 + }, + { + "epoch": 4.023997046517351, + "grad_norm": 0.9312800168991089, + "learning_rate": 1.2350069496962009e-06, + "loss": 0.5705, + "step": 10901 + }, + { + "epoch": 4.024366231848388, + "grad_norm": 0.9299803972244263, + "learning_rate": 1.234587458224964e-06, + "loss": 0.563, + "step": 10902 + }, + { + "epoch": 4.024735417179424, + "grad_norm": 0.9392796754837036, + "learning_rate": 1.2341680146491927e-06, + "loss": 0.5782, + "step": 10903 + }, + { + "epoch": 4.02510460251046, + "grad_norm": 0.9518870711326599, + "learning_rate": 1.2337486189847615e-06, + "loss": 0.5451, + "step": 10904 + }, + { + "epoch": 4.025473787841497, + "grad_norm": 0.921649694442749, + "learning_rate": 1.2333292712475461e-06, + "loss": 0.5333, + "step": 10905 + }, + { + "epoch": 4.025842973172533, + "grad_norm": 0.9170935750007629, + "learning_rate": 1.2329099714534184e-06, + "loss": 0.5946, + "step": 10906 + }, + { + "epoch": 4.026212158503569, + "grad_norm": 0.9533656239509583, + "learning_rate": 1.2324907196182472e-06, + "loss": 0.5432, + "step": 10907 + }, + { + "epoch": 4.0265813438346045, + "grad_norm": 0.9239468574523926, + "learning_rate": 1.2320715157579032e-06, + "loss": 0.5451, + "step": 10908 + }, + { + "epoch": 4.026950529165641, + "grad_norm": 0.9561068415641785, + "learning_rate": 1.2316523598882516e-06, + "loss": 0.5981, + "step": 10909 + }, + { + "epoch": 4.027319714496677, + "grad_norm": 0.9308600425720215, + "learning_rate": 1.2312332520251585e-06, + "loss": 0.5812, + "step": 10910 + }, + { + "epoch": 4.027688899827713, + "grad_norm": 0.9166485071182251, + "learning_rate": 1.2308141921844863e-06, + "loss": 0.5691, + "step": 10911 + }, + { + "epoch": 4.02805808515875, + "grad_norm": 0.9248771667480469, + "learning_rate": 1.2303951803820955e-06, + "loss": 0.5617, + "step": 10912 + }, + { + "epoch": 4.028427270489786, + "grad_norm": 0.9282413721084595, + "learning_rate": 1.2299762166338475e-06, + "loss": 0.5528, + "step": 10913 + }, + { + "epoch": 4.028796455820822, + "grad_norm": 0.9161816835403442, + "learning_rate": 1.2295573009555986e-06, + "loss": 0.5723, + "step": 10914 + }, + { + "epoch": 4.029165641151859, + "grad_norm": 0.9502900242805481, + "learning_rate": 1.2291384333632054e-06, + "loss": 0.5576, + "step": 10915 + }, + { + "epoch": 4.0295348264828945, + "grad_norm": 0.93194979429245, + "learning_rate": 1.2287196138725198e-06, + "loss": 0.5509, + "step": 10916 + }, + { + "epoch": 4.02990401181393, + "grad_norm": 0.9294022917747498, + "learning_rate": 1.228300842499396e-06, + "loss": 0.5569, + "step": 10917 + }, + { + "epoch": 4.030273197144966, + "grad_norm": 0.9431362748146057, + "learning_rate": 1.227882119259686e-06, + "loss": 0.6004, + "step": 10918 + }, + { + "epoch": 4.030642382476003, + "grad_norm": 0.956056535243988, + "learning_rate": 1.227463444169234e-06, + "loss": 0.5756, + "step": 10919 + }, + { + "epoch": 4.031011567807039, + "grad_norm": 0.9601041674613953, + "learning_rate": 1.2270448172438901e-06, + "loss": 0.5551, + "step": 10920 + }, + { + "epoch": 4.031380753138075, + "grad_norm": 0.9304986000061035, + "learning_rate": 1.2266262384994974e-06, + "loss": 0.5649, + "step": 10921 + }, + { + "epoch": 4.031749938469112, + "grad_norm": 0.9143564105033875, + "learning_rate": 1.2262077079519002e-06, + "loss": 0.555, + "step": 10922 + }, + { + "epoch": 4.032119123800148, + "grad_norm": 0.9427992701530457, + "learning_rate": 1.2257892256169396e-06, + "loss": 0.6033, + "step": 10923 + }, + { + "epoch": 4.032488309131184, + "grad_norm": 0.9546697735786438, + "learning_rate": 1.2253707915104538e-06, + "loss": 0.5924, + "step": 10924 + }, + { + "epoch": 4.03285749446222, + "grad_norm": 0.9382930994033813, + "learning_rate": 1.224952405648282e-06, + "loss": 0.5842, + "step": 10925 + }, + { + "epoch": 4.033226679793256, + "grad_norm": 0.93663090467453, + "learning_rate": 1.2245340680462592e-06, + "loss": 0.5749, + "step": 10926 + }, + { + "epoch": 4.033595865124292, + "grad_norm": 0.9335216283798218, + "learning_rate": 1.2241157787202185e-06, + "loss": 0.574, + "step": 10927 + }, + { + "epoch": 4.033965050455328, + "grad_norm": 0.9382838606834412, + "learning_rate": 1.2236975376859939e-06, + "loss": 0.5799, + "step": 10928 + }, + { + "epoch": 4.034334235786365, + "grad_norm": 0.9321743249893188, + "learning_rate": 1.2232793449594135e-06, + "loss": 0.545, + "step": 10929 + }, + { + "epoch": 4.034703421117401, + "grad_norm": 0.9302109479904175, + "learning_rate": 1.2228612005563082e-06, + "loss": 0.5413, + "step": 10930 + }, + { + "epoch": 4.035072606448437, + "grad_norm": 0.9297304153442383, + "learning_rate": 1.2224431044925033e-06, + "loss": 0.5744, + "step": 10931 + }, + { + "epoch": 4.035441791779474, + "grad_norm": 0.937210202217102, + "learning_rate": 1.2220250567838226e-06, + "loss": 0.5703, + "step": 10932 + }, + { + "epoch": 4.0358109771105095, + "grad_norm": 0.9105681777000427, + "learning_rate": 1.2216070574460913e-06, + "loss": 0.5671, + "step": 10933 + }, + { + "epoch": 4.036180162441545, + "grad_norm": 0.9303982257843018, + "learning_rate": 1.2211891064951294e-06, + "loss": 0.5757, + "step": 10934 + }, + { + "epoch": 4.036549347772582, + "grad_norm": 0.9483687877655029, + "learning_rate": 1.220771203946755e-06, + "loss": 0.5906, + "step": 10935 + }, + { + "epoch": 4.036918533103618, + "grad_norm": 0.9583637118339539, + "learning_rate": 1.2203533498167882e-06, + "loss": 0.5925, + "step": 10936 + }, + { + "epoch": 4.037287718434654, + "grad_norm": 0.9799261093139648, + "learning_rate": 1.219935544121042e-06, + "loss": 0.5962, + "step": 10937 + }, + { + "epoch": 4.03765690376569, + "grad_norm": 0.9385157823562622, + "learning_rate": 1.2195177868753324e-06, + "loss": 0.575, + "step": 10938 + }, + { + "epoch": 4.038026089096727, + "grad_norm": 0.9379318356513977, + "learning_rate": 1.2191000780954707e-06, + "loss": 0.5709, + "step": 10939 + }, + { + "epoch": 4.038395274427763, + "grad_norm": 0.9296746253967285, + "learning_rate": 1.2186824177972657e-06, + "loss": 0.567, + "step": 10940 + }, + { + "epoch": 4.038764459758799, + "grad_norm": 0.9206678867340088, + "learning_rate": 1.2182648059965275e-06, + "loss": 0.5809, + "step": 10941 + }, + { + "epoch": 4.039133645089835, + "grad_norm": 0.9736154675483704, + "learning_rate": 1.2178472427090621e-06, + "loss": 0.5691, + "step": 10942 + }, + { + "epoch": 4.039502830420871, + "grad_norm": 0.9338938593864441, + "learning_rate": 1.217429727950674e-06, + "loss": 0.6189, + "step": 10943 + }, + { + "epoch": 4.039872015751907, + "grad_norm": 0.9498529434204102, + "learning_rate": 1.2170122617371646e-06, + "loss": 0.6028, + "step": 10944 + }, + { + "epoch": 4.040241201082944, + "grad_norm": 1.1917463541030884, + "learning_rate": 1.2165948440843364e-06, + "loss": 0.5746, + "step": 10945 + }, + { + "epoch": 4.04061038641398, + "grad_norm": 0.9233893156051636, + "learning_rate": 1.2161774750079903e-06, + "loss": 0.5815, + "step": 10946 + }, + { + "epoch": 4.040979571745016, + "grad_norm": 0.931523323059082, + "learning_rate": 1.2157601545239197e-06, + "loss": 0.5942, + "step": 10947 + }, + { + "epoch": 4.041348757076052, + "grad_norm": 0.9564704895019531, + "learning_rate": 1.2153428826479228e-06, + "loss": 0.5918, + "step": 10948 + }, + { + "epoch": 4.041717942407089, + "grad_norm": 0.9211004972457886, + "learning_rate": 1.2149256593957914e-06, + "loss": 0.5809, + "step": 10949 + }, + { + "epoch": 4.0420871277381245, + "grad_norm": 0.9532545208930969, + "learning_rate": 1.2145084847833192e-06, + "loss": 0.5785, + "step": 10950 + }, + { + "epoch": 4.04245631306916, + "grad_norm": 0.9689568281173706, + "learning_rate": 1.2140913588262952e-06, + "loss": 0.5692, + "step": 10951 + }, + { + "epoch": 4.042825498400197, + "grad_norm": 0.9656877517700195, + "learning_rate": 1.2136742815405062e-06, + "loss": 0.578, + "step": 10952 + }, + { + "epoch": 4.043194683731233, + "grad_norm": 0.9553203582763672, + "learning_rate": 1.2132572529417408e-06, + "loss": 0.5766, + "step": 10953 + }, + { + "epoch": 4.043563869062269, + "grad_norm": 0.9460417628288269, + "learning_rate": 1.2128402730457823e-06, + "loss": 0.5938, + "step": 10954 + }, + { + "epoch": 4.043933054393306, + "grad_norm": 0.9493420124053955, + "learning_rate": 1.2124233418684122e-06, + "loss": 0.592, + "step": 10955 + }, + { + "epoch": 4.044302239724342, + "grad_norm": 0.9581380486488342, + "learning_rate": 1.2120064594254132e-06, + "loss": 0.5906, + "step": 10956 + }, + { + "epoch": 4.044671425055378, + "grad_norm": 0.9323729872703552, + "learning_rate": 1.2115896257325623e-06, + "loss": 0.5703, + "step": 10957 + }, + { + "epoch": 4.045040610386414, + "grad_norm": 0.9437854886054993, + "learning_rate": 1.2111728408056385e-06, + "loss": 0.5673, + "step": 10958 + }, + { + "epoch": 4.04540979571745, + "grad_norm": 0.9538025856018066, + "learning_rate": 1.2107561046604158e-06, + "loss": 0.6219, + "step": 10959 + }, + { + "epoch": 4.045778981048486, + "grad_norm": 0.923189103603363, + "learning_rate": 1.2103394173126668e-06, + "loss": 0.5736, + "step": 10960 + }, + { + "epoch": 4.046148166379522, + "grad_norm": 0.9357838034629822, + "learning_rate": 1.2099227787781648e-06, + "loss": 0.5953, + "step": 10961 + }, + { + "epoch": 4.046517351710559, + "grad_norm": 0.9375860691070557, + "learning_rate": 1.2095061890726783e-06, + "loss": 0.5982, + "step": 10962 + }, + { + "epoch": 4.046886537041595, + "grad_norm": 0.941388726234436, + "learning_rate": 1.2090896482119752e-06, + "loss": 0.5618, + "step": 10963 + }, + { + "epoch": 4.047255722372631, + "grad_norm": 0.9246278405189514, + "learning_rate": 1.2086731562118207e-06, + "loss": 0.5443, + "step": 10964 + }, + { + "epoch": 4.047624907703668, + "grad_norm": 0.9216983914375305, + "learning_rate": 1.2082567130879795e-06, + "loss": 0.5218, + "step": 10965 + }, + { + "epoch": 4.047994093034704, + "grad_norm": 0.8817718029022217, + "learning_rate": 1.2078403188562163e-06, + "loss": 0.5916, + "step": 10966 + }, + { + "epoch": 4.0483632783657395, + "grad_norm": 0.9124845266342163, + "learning_rate": 1.207423973532287e-06, + "loss": 0.5681, + "step": 10967 + }, + { + "epoch": 4.048732463696775, + "grad_norm": 0.9193453192710876, + "learning_rate": 1.2070076771319536e-06, + "loss": 0.5819, + "step": 10968 + }, + { + "epoch": 4.049101649027812, + "grad_norm": 0.9792888164520264, + "learning_rate": 1.2065914296709705e-06, + "loss": 0.556, + "step": 10969 + }, + { + "epoch": 4.049470834358848, + "grad_norm": 0.9531697034835815, + "learning_rate": 1.2061752311650944e-06, + "loss": 0.5727, + "step": 10970 + }, + { + "epoch": 4.049840019689884, + "grad_norm": 0.9432228803634644, + "learning_rate": 1.2057590816300774e-06, + "loss": 0.573, + "step": 10971 + }, + { + "epoch": 4.050209205020921, + "grad_norm": 0.9497817754745483, + "learning_rate": 1.20534298108167e-06, + "loss": 0.5855, + "step": 10972 + }, + { + "epoch": 4.050578390351957, + "grad_norm": 0.9661943912506104, + "learning_rate": 1.204926929535622e-06, + "loss": 0.5679, + "step": 10973 + }, + { + "epoch": 4.050947575682993, + "grad_norm": 0.9182208180427551, + "learning_rate": 1.2045109270076826e-06, + "loss": 0.5543, + "step": 10974 + }, + { + "epoch": 4.051316761014029, + "grad_norm": 0.9489305019378662, + "learning_rate": 1.2040949735135943e-06, + "loss": 0.5588, + "step": 10975 + }, + { + "epoch": 4.051685946345065, + "grad_norm": 0.929717481136322, + "learning_rate": 1.203679069069103e-06, + "loss": 0.5815, + "step": 10976 + }, + { + "epoch": 4.052055131676101, + "grad_norm": 0.9598423838615417, + "learning_rate": 1.203263213689949e-06, + "loss": 0.569, + "step": 10977 + }, + { + "epoch": 4.052424317007137, + "grad_norm": 0.9369844198226929, + "learning_rate": 1.202847407391874e-06, + "loss": 0.5489, + "step": 10978 + }, + { + "epoch": 4.052793502338174, + "grad_norm": 0.9485114812850952, + "learning_rate": 1.2024316501906152e-06, + "loss": 0.5569, + "step": 10979 + }, + { + "epoch": 4.05316268766921, + "grad_norm": 0.8902856707572937, + "learning_rate": 1.2020159421019081e-06, + "loss": 0.546, + "step": 10980 + }, + { + "epoch": 4.053531873000246, + "grad_norm": 0.9592317938804626, + "learning_rate": 1.201600283141489e-06, + "loss": 0.5611, + "step": 10981 + }, + { + "epoch": 4.053901058331283, + "grad_norm": 0.9342727661132812, + "learning_rate": 1.201184673325089e-06, + "loss": 0.5673, + "step": 10982 + }, + { + "epoch": 4.054270243662319, + "grad_norm": 0.9311964511871338, + "learning_rate": 1.2007691126684384e-06, + "loss": 0.593, + "step": 10983 + }, + { + "epoch": 4.0546394289933545, + "grad_norm": 0.9509251713752747, + "learning_rate": 1.2003536011872677e-06, + "loss": 0.5515, + "step": 10984 + }, + { + "epoch": 4.05500861432439, + "grad_norm": 0.9372209310531616, + "learning_rate": 1.1999381388973022e-06, + "loss": 0.5571, + "step": 10985 + }, + { + "epoch": 4.055377799655427, + "grad_norm": 0.9362126588821411, + "learning_rate": 1.1995227258142698e-06, + "loss": 0.5953, + "step": 10986 + }, + { + "epoch": 4.055746984986463, + "grad_norm": 0.9544323682785034, + "learning_rate": 1.1991073619538898e-06, + "loss": 0.5757, + "step": 10987 + }, + { + "epoch": 4.056116170317499, + "grad_norm": 0.961902379989624, + "learning_rate": 1.1986920473318856e-06, + "loss": 0.5756, + "step": 10988 + }, + { + "epoch": 4.056485355648536, + "grad_norm": 0.9421068429946899, + "learning_rate": 1.1982767819639777e-06, + "loss": 0.5822, + "step": 10989 + }, + { + "epoch": 4.056854540979572, + "grad_norm": 0.939942479133606, + "learning_rate": 1.1978615658658828e-06, + "loss": 0.5324, + "step": 10990 + }, + { + "epoch": 4.057223726310608, + "grad_norm": 0.9413439631462097, + "learning_rate": 1.1974463990533167e-06, + "loss": 0.5747, + "step": 10991 + }, + { + "epoch": 4.0575929116416445, + "grad_norm": 0.9599975943565369, + "learning_rate": 1.197031281541992e-06, + "loss": 0.5906, + "step": 10992 + }, + { + "epoch": 4.05796209697268, + "grad_norm": 0.9435157179832458, + "learning_rate": 1.1966162133476225e-06, + "loss": 0.6009, + "step": 10993 + }, + { + "epoch": 4.058331282303716, + "grad_norm": 0.9354333877563477, + "learning_rate": 1.1962011944859197e-06, + "loss": 0.6252, + "step": 10994 + }, + { + "epoch": 4.058700467634752, + "grad_norm": 0.9470630884170532, + "learning_rate": 1.1957862249725883e-06, + "loss": 0.593, + "step": 10995 + }, + { + "epoch": 4.059069652965789, + "grad_norm": 0.9415104389190674, + "learning_rate": 1.1953713048233375e-06, + "loss": 0.6074, + "step": 10996 + }, + { + "epoch": 4.059438838296825, + "grad_norm": 0.9561862349510193, + "learning_rate": 1.1949564340538706e-06, + "loss": 0.5657, + "step": 10997 + }, + { + "epoch": 4.059808023627861, + "grad_norm": 0.9257074594497681, + "learning_rate": 1.1945416126798912e-06, + "loss": 0.5916, + "step": 10998 + }, + { + "epoch": 4.060177208958898, + "grad_norm": 0.9504919648170471, + "learning_rate": 1.1941268407171e-06, + "loss": 0.6005, + "step": 10999 + }, + { + "epoch": 4.060546394289934, + "grad_norm": 0.9585803151130676, + "learning_rate": 1.1937121181811948e-06, + "loss": 0.5769, + "step": 11000 + }, + { + "epoch": 4.0609155796209695, + "grad_norm": 0.9364815950393677, + "learning_rate": 1.1932974450878746e-06, + "loss": 0.581, + "step": 11001 + }, + { + "epoch": 4.061284764952006, + "grad_norm": 0.9556556344032288, + "learning_rate": 1.1928828214528334e-06, + "loss": 0.5575, + "step": 11002 + }, + { + "epoch": 4.061653950283042, + "grad_norm": 0.9413087368011475, + "learning_rate": 1.1924682472917642e-06, + "loss": 0.5711, + "step": 11003 + }, + { + "epoch": 4.062023135614078, + "grad_norm": 0.9381484389305115, + "learning_rate": 1.1920537226203602e-06, + "loss": 0.5262, + "step": 11004 + }, + { + "epoch": 4.062392320945114, + "grad_norm": 0.9178210496902466, + "learning_rate": 1.191639247454309e-06, + "loss": 0.5154, + "step": 11005 + }, + { + "epoch": 4.062761506276151, + "grad_norm": 0.9144070148468018, + "learning_rate": 1.191224821809301e-06, + "loss": 0.5914, + "step": 11006 + }, + { + "epoch": 4.063130691607187, + "grad_norm": 1.0192734003067017, + "learning_rate": 1.1908104457010188e-06, + "loss": 0.5976, + "step": 11007 + }, + { + "epoch": 4.063499876938223, + "grad_norm": 0.9365906119346619, + "learning_rate": 1.190396119145148e-06, + "loss": 0.5822, + "step": 11008 + }, + { + "epoch": 4.0638690622692595, + "grad_norm": 0.9735681414604187, + "learning_rate": 1.1899818421573718e-06, + "loss": 0.5759, + "step": 11009 + }, + { + "epoch": 4.064238247600295, + "grad_norm": 0.9936011433601379, + "learning_rate": 1.1895676147533692e-06, + "loss": 0.5609, + "step": 11010 + }, + { + "epoch": 4.064607432931331, + "grad_norm": 0.9485435485839844, + "learning_rate": 1.189153436948818e-06, + "loss": 0.5828, + "step": 11011 + }, + { + "epoch": 4.064976618262368, + "grad_norm": 0.9405845999717712, + "learning_rate": 1.1887393087593966e-06, + "loss": 0.589, + "step": 11012 + }, + { + "epoch": 4.065345803593404, + "grad_norm": 0.9732291102409363, + "learning_rate": 1.1883252302007773e-06, + "loss": 0.6195, + "step": 11013 + }, + { + "epoch": 4.06571498892444, + "grad_norm": 0.9665850400924683, + "learning_rate": 1.187911201288636e-06, + "loss": 0.572, + "step": 11014 + }, + { + "epoch": 4.066084174255476, + "grad_norm": 0.9621134400367737, + "learning_rate": 1.1874972220386399e-06, + "loss": 0.5705, + "step": 11015 + }, + { + "epoch": 4.066453359586513, + "grad_norm": 0.9646546840667725, + "learning_rate": 1.1870832924664596e-06, + "loss": 0.602, + "step": 11016 + }, + { + "epoch": 4.066822544917549, + "grad_norm": 0.956916093826294, + "learning_rate": 1.1866694125877634e-06, + "loss": 0.5876, + "step": 11017 + }, + { + "epoch": 4.0671917302485845, + "grad_norm": 0.9332398176193237, + "learning_rate": 1.1862555824182153e-06, + "loss": 0.5908, + "step": 11018 + }, + { + "epoch": 4.067560915579621, + "grad_norm": 0.9513016939163208, + "learning_rate": 1.1858418019734787e-06, + "loss": 0.5689, + "step": 11019 + }, + { + "epoch": 4.067930100910657, + "grad_norm": 0.9729171395301819, + "learning_rate": 1.1854280712692144e-06, + "loss": 0.5741, + "step": 11020 + }, + { + "epoch": 4.068299286241693, + "grad_norm": 0.9206558465957642, + "learning_rate": 1.1850143903210826e-06, + "loss": 0.5687, + "step": 11021 + }, + { + "epoch": 4.06866847157273, + "grad_norm": 0.9583123326301575, + "learning_rate": 1.184600759144743e-06, + "loss": 0.5454, + "step": 11022 + }, + { + "epoch": 4.069037656903766, + "grad_norm": 0.9157056212425232, + "learning_rate": 1.1841871777558477e-06, + "loss": 0.5738, + "step": 11023 + }, + { + "epoch": 4.069406842234802, + "grad_norm": 0.9425524473190308, + "learning_rate": 1.1837736461700532e-06, + "loss": 0.5627, + "step": 11024 + }, + { + "epoch": 4.069776027565838, + "grad_norm": 0.9328688979148865, + "learning_rate": 1.1833601644030101e-06, + "loss": 0.578, + "step": 11025 + }, + { + "epoch": 4.0701452128968745, + "grad_norm": 0.9595530033111572, + "learning_rate": 1.18294673247037e-06, + "loss": 0.6134, + "step": 11026 + }, + { + "epoch": 4.07051439822791, + "grad_norm": 0.9526441693305969, + "learning_rate": 1.1825333503877805e-06, + "loss": 0.6076, + "step": 11027 + }, + { + "epoch": 4.070883583558946, + "grad_norm": 0.9474645853042603, + "learning_rate": 1.182120018170887e-06, + "loss": 0.5739, + "step": 11028 + }, + { + "epoch": 4.071252768889983, + "grad_norm": 0.9609004259109497, + "learning_rate": 1.1817067358353357e-06, + "loss": 0.5541, + "step": 11029 + }, + { + "epoch": 4.071621954221019, + "grad_norm": 0.9330994486808777, + "learning_rate": 1.1812935033967685e-06, + "loss": 0.5486, + "step": 11030 + }, + { + "epoch": 4.071991139552055, + "grad_norm": 1.0593745708465576, + "learning_rate": 1.180880320870825e-06, + "loss": 0.5979, + "step": 11031 + }, + { + "epoch": 4.072360324883092, + "grad_norm": 0.9341237545013428, + "learning_rate": 1.1804671882731458e-06, + "loss": 0.5573, + "step": 11032 + }, + { + "epoch": 4.072729510214128, + "grad_norm": 0.9734179377555847, + "learning_rate": 1.1800541056193666e-06, + "loss": 0.5866, + "step": 11033 + }, + { + "epoch": 4.073098695545164, + "grad_norm": 0.9291723370552063, + "learning_rate": 1.1796410729251243e-06, + "loss": 0.579, + "step": 11034 + }, + { + "epoch": 4.0734678808761995, + "grad_norm": 0.9615142941474915, + "learning_rate": 1.1792280902060491e-06, + "loss": 0.5776, + "step": 11035 + }, + { + "epoch": 4.073837066207236, + "grad_norm": 0.9302604794502258, + "learning_rate": 1.1788151574777737e-06, + "loss": 0.5808, + "step": 11036 + }, + { + "epoch": 4.074206251538272, + "grad_norm": 0.9500296711921692, + "learning_rate": 1.1784022747559286e-06, + "loss": 0.5451, + "step": 11037 + }, + { + "epoch": 4.074575436869308, + "grad_norm": 0.9331037402153015, + "learning_rate": 1.1779894420561403e-06, + "loss": 0.548, + "step": 11038 + }, + { + "epoch": 4.074944622200345, + "grad_norm": 0.9373279213905334, + "learning_rate": 1.1775766593940343e-06, + "loss": 0.5554, + "step": 11039 + }, + { + "epoch": 4.075313807531381, + "grad_norm": 0.9543520212173462, + "learning_rate": 1.1771639267852336e-06, + "loss": 0.5534, + "step": 11040 + }, + { + "epoch": 4.075682992862417, + "grad_norm": 0.9318366050720215, + "learning_rate": 1.1767512442453607e-06, + "loss": 0.5671, + "step": 11041 + }, + { + "epoch": 4.076052178193454, + "grad_norm": 0.9417281150817871, + "learning_rate": 1.1763386117900376e-06, + "loss": 0.5373, + "step": 11042 + }, + { + "epoch": 4.0764213635244895, + "grad_norm": 0.9051100611686707, + "learning_rate": 1.175926029434878e-06, + "loss": 0.5692, + "step": 11043 + }, + { + "epoch": 4.076790548855525, + "grad_norm": 0.9443027973175049, + "learning_rate": 1.1755134971955017e-06, + "loss": 0.5475, + "step": 11044 + }, + { + "epoch": 4.077159734186561, + "grad_norm": 0.9643303155899048, + "learning_rate": 1.1751010150875205e-06, + "loss": 0.5981, + "step": 11045 + }, + { + "epoch": 4.077528919517598, + "grad_norm": 0.9843177795410156, + "learning_rate": 1.1746885831265486e-06, + "loss": 0.5681, + "step": 11046 + }, + { + "epoch": 4.077898104848634, + "grad_norm": 0.9183872938156128, + "learning_rate": 1.1742762013281955e-06, + "loss": 0.5628, + "step": 11047 + }, + { + "epoch": 4.07826729017967, + "grad_norm": 0.9325457215309143, + "learning_rate": 1.173863869708069e-06, + "loss": 0.5454, + "step": 11048 + }, + { + "epoch": 4.078636475510707, + "grad_norm": 0.9449958205223083, + "learning_rate": 1.1734515882817772e-06, + "loss": 0.5626, + "step": 11049 + }, + { + "epoch": 4.079005660841743, + "grad_norm": 0.9262201189994812, + "learning_rate": 1.1730393570649242e-06, + "loss": 0.5607, + "step": 11050 + }, + { + "epoch": 4.079374846172779, + "grad_norm": 0.9119241237640381, + "learning_rate": 1.1726271760731118e-06, + "loss": 0.5749, + "step": 11051 + }, + { + "epoch": 4.0797440315038145, + "grad_norm": 0.9297060966491699, + "learning_rate": 1.1722150453219428e-06, + "loss": 0.518, + "step": 11052 + }, + { + "epoch": 4.080113216834851, + "grad_norm": 0.9326635003089905, + "learning_rate": 1.1718029648270143e-06, + "loss": 0.5571, + "step": 11053 + }, + { + "epoch": 4.080482402165887, + "grad_norm": 0.9670947790145874, + "learning_rate": 1.1713909346039254e-06, + "loss": 0.5599, + "step": 11054 + }, + { + "epoch": 4.080851587496923, + "grad_norm": 0.9628434777259827, + "learning_rate": 1.1709789546682704e-06, + "loss": 0.6026, + "step": 11055 + }, + { + "epoch": 4.08122077282796, + "grad_norm": 0.9727362990379333, + "learning_rate": 1.1705670250356417e-06, + "loss": 0.617, + "step": 11056 + }, + { + "epoch": 4.081589958158996, + "grad_norm": 0.9681711792945862, + "learning_rate": 1.1701551457216321e-06, + "loss": 0.56, + "step": 11057 + }, + { + "epoch": 4.081959143490032, + "grad_norm": 0.9645869135856628, + "learning_rate": 1.1697433167418306e-06, + "loss": 0.5602, + "step": 11058 + }, + { + "epoch": 4.0823283288210686, + "grad_norm": 0.950752317905426, + "learning_rate": 1.1693315381118238e-06, + "loss": 0.605, + "step": 11059 + }, + { + "epoch": 4.0826975141521045, + "grad_norm": 0.9468844532966614, + "learning_rate": 1.1689198098471993e-06, + "loss": 0.5522, + "step": 11060 + }, + { + "epoch": 4.08306669948314, + "grad_norm": 0.9672972559928894, + "learning_rate": 1.1685081319635392e-06, + "loss": 0.5511, + "step": 11061 + }, + { + "epoch": 4.083435884814176, + "grad_norm": 0.9502522349357605, + "learning_rate": 1.1680965044764276e-06, + "loss": 0.5701, + "step": 11062 + }, + { + "epoch": 4.083805070145213, + "grad_norm": 0.9666215777397156, + "learning_rate": 1.1676849274014412e-06, + "loss": 0.6364, + "step": 11063 + }, + { + "epoch": 4.084174255476249, + "grad_norm": 0.9429125189781189, + "learning_rate": 1.16727340075416e-06, + "loss": 0.5752, + "step": 11064 + }, + { + "epoch": 4.084543440807285, + "grad_norm": 0.9480336308479309, + "learning_rate": 1.1668619245501606e-06, + "loss": 0.5782, + "step": 11065 + }, + { + "epoch": 4.084912626138322, + "grad_norm": 0.9612739086151123, + "learning_rate": 1.166450498805017e-06, + "loss": 0.5681, + "step": 11066 + }, + { + "epoch": 4.085281811469358, + "grad_norm": 0.9716250896453857, + "learning_rate": 1.1660391235343008e-06, + "loss": 0.5856, + "step": 11067 + }, + { + "epoch": 4.085650996800394, + "grad_norm": 0.9585658311843872, + "learning_rate": 1.1656277987535819e-06, + "loss": 0.5614, + "step": 11068 + }, + { + "epoch": 4.08602018213143, + "grad_norm": 0.9301977753639221, + "learning_rate": 1.1652165244784306e-06, + "loss": 0.6023, + "step": 11069 + }, + { + "epoch": 4.086389367462466, + "grad_norm": 0.9480682611465454, + "learning_rate": 1.1648053007244127e-06, + "loss": 0.5872, + "step": 11070 + }, + { + "epoch": 4.086758552793502, + "grad_norm": 0.9282101988792419, + "learning_rate": 1.1643941275070917e-06, + "loss": 0.5667, + "step": 11071 + }, + { + "epoch": 4.087127738124538, + "grad_norm": 0.9550393223762512, + "learning_rate": 1.1639830048420325e-06, + "loss": 0.5956, + "step": 11072 + }, + { + "epoch": 4.087496923455575, + "grad_norm": 0.9652387499809265, + "learning_rate": 1.163571932744794e-06, + "loss": 0.6584, + "step": 11073 + }, + { + "epoch": 4.087866108786611, + "grad_norm": 0.937347412109375, + "learning_rate": 1.1631609112309367e-06, + "loss": 0.5846, + "step": 11074 + }, + { + "epoch": 4.088235294117647, + "grad_norm": 0.9334675073623657, + "learning_rate": 1.1627499403160172e-06, + "loss": 0.541, + "step": 11075 + }, + { + "epoch": 4.0886044794486835, + "grad_norm": 0.9363857507705688, + "learning_rate": 1.1623390200155896e-06, + "loss": 0.5719, + "step": 11076 + }, + { + "epoch": 4.0889736647797195, + "grad_norm": 0.9533934593200684, + "learning_rate": 1.1619281503452088e-06, + "loss": 0.5658, + "step": 11077 + }, + { + "epoch": 4.089342850110755, + "grad_norm": 0.949829638004303, + "learning_rate": 1.161517331320425e-06, + "loss": 0.5619, + "step": 11078 + }, + { + "epoch": 4.089712035441792, + "grad_norm": 0.9355161190032959, + "learning_rate": 1.161106562956787e-06, + "loss": 0.5733, + "step": 11079 + }, + { + "epoch": 4.090081220772828, + "grad_norm": 0.9463686347007751, + "learning_rate": 1.1606958452698442e-06, + "loss": 0.5898, + "step": 11080 + }, + { + "epoch": 4.090450406103864, + "grad_norm": 0.8554579615592957, + "learning_rate": 1.1602851782751398e-06, + "loss": 0.5166, + "step": 11081 + }, + { + "epoch": 4.0908195914349, + "grad_norm": 0.9226812124252319, + "learning_rate": 1.1598745619882196e-06, + "loss": 0.5581, + "step": 11082 + }, + { + "epoch": 4.091188776765937, + "grad_norm": 0.9674760699272156, + "learning_rate": 1.159463996424624e-06, + "loss": 0.6014, + "step": 11083 + }, + { + "epoch": 4.091557962096973, + "grad_norm": 0.9723924994468689, + "learning_rate": 1.1590534815998924e-06, + "loss": 0.5997, + "step": 11084 + }, + { + "epoch": 4.091927147428009, + "grad_norm": 0.9520078301429749, + "learning_rate": 1.1586430175295642e-06, + "loss": 0.5615, + "step": 11085 + }, + { + "epoch": 4.092296332759045, + "grad_norm": 0.9497332572937012, + "learning_rate": 1.1582326042291744e-06, + "loss": 0.5328, + "step": 11086 + }, + { + "epoch": 4.092665518090081, + "grad_norm": 0.9119506478309631, + "learning_rate": 1.1578222417142561e-06, + "loss": 0.6039, + "step": 11087 + }, + { + "epoch": 4.093034703421117, + "grad_norm": 0.9467915296554565, + "learning_rate": 1.1574119300003433e-06, + "loss": 0.5988, + "step": 11088 + }, + { + "epoch": 4.093403888752154, + "grad_norm": 0.9453967213630676, + "learning_rate": 1.1570016691029654e-06, + "loss": 0.5829, + "step": 11089 + }, + { + "epoch": 4.09377307408319, + "grad_norm": 0.9620592594146729, + "learning_rate": 1.1565914590376503e-06, + "loss": 0.5746, + "step": 11090 + }, + { + "epoch": 4.094142259414226, + "grad_norm": 0.954048216342926, + "learning_rate": 1.1561812998199235e-06, + "loss": 0.5877, + "step": 11091 + }, + { + "epoch": 4.094511444745262, + "grad_norm": 0.9660516977310181, + "learning_rate": 1.1557711914653106e-06, + "loss": 0.5747, + "step": 11092 + }, + { + "epoch": 4.0948806300762985, + "grad_norm": 0.9346960783004761, + "learning_rate": 1.1553611339893344e-06, + "loss": 0.571, + "step": 11093 + }, + { + "epoch": 4.0952498154073345, + "grad_norm": 0.9447574019432068, + "learning_rate": 1.1549511274075154e-06, + "loss": 0.5359, + "step": 11094 + }, + { + "epoch": 4.09561900073837, + "grad_norm": 0.9615253806114197, + "learning_rate": 1.1545411717353714e-06, + "loss": 0.5663, + "step": 11095 + }, + { + "epoch": 4.095988186069407, + "grad_norm": 0.9325082898139954, + "learning_rate": 1.1541312669884186e-06, + "loss": 0.6066, + "step": 11096 + }, + { + "epoch": 4.096357371400443, + "grad_norm": 0.9953460693359375, + "learning_rate": 1.1537214131821736e-06, + "loss": 0.5501, + "step": 11097 + }, + { + "epoch": 4.096726556731479, + "grad_norm": 0.9549190998077393, + "learning_rate": 1.1533116103321479e-06, + "loss": 0.5367, + "step": 11098 + }, + { + "epoch": 4.097095742062516, + "grad_norm": 0.9162867665290833, + "learning_rate": 1.152901858453852e-06, + "loss": 0.5789, + "step": 11099 + }, + { + "epoch": 4.097464927393552, + "grad_norm": 0.9408636093139648, + "learning_rate": 1.1524921575627967e-06, + "loss": 0.5506, + "step": 11100 + }, + { + "epoch": 4.097834112724588, + "grad_norm": 0.959723174571991, + "learning_rate": 1.1520825076744867e-06, + "loss": 0.568, + "step": 11101 + }, + { + "epoch": 4.0982032980556236, + "grad_norm": 0.9843590259552002, + "learning_rate": 1.1516729088044296e-06, + "loss": 0.6147, + "step": 11102 + }, + { + "epoch": 4.09857248338666, + "grad_norm": 0.9929567575454712, + "learning_rate": 1.1512633609681273e-06, + "loss": 0.6076, + "step": 11103 + }, + { + "epoch": 4.098941668717696, + "grad_norm": 0.976847767829895, + "learning_rate": 1.1508538641810802e-06, + "loss": 0.621, + "step": 11104 + }, + { + "epoch": 4.099310854048732, + "grad_norm": 0.9672521352767944, + "learning_rate": 1.1504444184587893e-06, + "loss": 0.5377, + "step": 11105 + }, + { + "epoch": 4.099680039379769, + "grad_norm": 0.9361658096313477, + "learning_rate": 1.1500350238167512e-06, + "loss": 0.5731, + "step": 11106 + }, + { + "epoch": 4.100049224710805, + "grad_norm": 0.954596996307373, + "learning_rate": 1.1496256802704605e-06, + "loss": 0.5675, + "step": 11107 + }, + { + "epoch": 4.100418410041841, + "grad_norm": 0.8981071710586548, + "learning_rate": 1.149216387835413e-06, + "loss": 0.5671, + "step": 11108 + }, + { + "epoch": 4.100787595372877, + "grad_norm": 1.0206749439239502, + "learning_rate": 1.1488071465270984e-06, + "loss": 0.5769, + "step": 11109 + }, + { + "epoch": 4.1011567807039135, + "grad_norm": 0.9290213584899902, + "learning_rate": 1.148397956361007e-06, + "loss": 0.5711, + "step": 11110 + }, + { + "epoch": 4.1015259660349495, + "grad_norm": 0.9345331788063049, + "learning_rate": 1.1479888173526254e-06, + "loss": 0.5969, + "step": 11111 + }, + { + "epoch": 4.101895151365985, + "grad_norm": 0.9552022218704224, + "learning_rate": 1.1475797295174404e-06, + "loss": 0.595, + "step": 11112 + }, + { + "epoch": 4.102264336697022, + "grad_norm": 0.9492177367210388, + "learning_rate": 1.147170692870937e-06, + "loss": 0.5807, + "step": 11113 + }, + { + "epoch": 4.102633522028058, + "grad_norm": 0.9222142100334167, + "learning_rate": 1.1467617074285956e-06, + "loss": 0.5555, + "step": 11114 + }, + { + "epoch": 4.103002707359094, + "grad_norm": 0.9358189105987549, + "learning_rate": 1.1463527732058966e-06, + "loss": 0.6017, + "step": 11115 + }, + { + "epoch": 4.103371892690131, + "grad_norm": 0.989465057849884, + "learning_rate": 1.1459438902183167e-06, + "loss": 0.571, + "step": 11116 + }, + { + "epoch": 4.103741078021167, + "grad_norm": 0.9483144879341125, + "learning_rate": 1.1455350584813345e-06, + "loss": 0.6103, + "step": 11117 + }, + { + "epoch": 4.104110263352203, + "grad_norm": 0.935605525970459, + "learning_rate": 1.1451262780104228e-06, + "loss": 0.549, + "step": 11118 + }, + { + "epoch": 4.1044794486832386, + "grad_norm": 0.9326417446136475, + "learning_rate": 1.144717548821053e-06, + "loss": 0.5521, + "step": 11119 + }, + { + "epoch": 4.104848634014275, + "grad_norm": 0.9447380304336548, + "learning_rate": 1.144308870928696e-06, + "loss": 0.5772, + "step": 11120 + }, + { + "epoch": 4.105217819345311, + "grad_norm": 0.9764747619628906, + "learning_rate": 1.1439002443488215e-06, + "loss": 0.5233, + "step": 11121 + }, + { + "epoch": 4.105587004676347, + "grad_norm": 0.9486321806907654, + "learning_rate": 1.1434916690968947e-06, + "loss": 0.5601, + "step": 11122 + }, + { + "epoch": 4.105956190007384, + "grad_norm": 0.9211226105690002, + "learning_rate": 1.14308314518838e-06, + "loss": 0.5706, + "step": 11123 + }, + { + "epoch": 4.10632537533842, + "grad_norm": 0.9766160845756531, + "learning_rate": 1.1426746726387392e-06, + "loss": 0.5875, + "step": 11124 + }, + { + "epoch": 4.106694560669456, + "grad_norm": 0.9692445397377014, + "learning_rate": 1.1422662514634345e-06, + "loss": 0.5828, + "step": 11125 + }, + { + "epoch": 4.107063746000493, + "grad_norm": 0.9724578261375427, + "learning_rate": 1.141857881677924e-06, + "loss": 0.5793, + "step": 11126 + }, + { + "epoch": 4.1074329313315285, + "grad_norm": 0.961215078830719, + "learning_rate": 1.1414495632976627e-06, + "loss": 0.5879, + "step": 11127 + }, + { + "epoch": 4.1078021166625645, + "grad_norm": 0.9645792245864868, + "learning_rate": 1.1410412963381076e-06, + "loss": 0.5674, + "step": 11128 + }, + { + "epoch": 4.1081713019936, + "grad_norm": 0.9325297474861145, + "learning_rate": 1.1406330808147109e-06, + "loss": 0.5689, + "step": 11129 + }, + { + "epoch": 4.108540487324637, + "grad_norm": 0.9468836784362793, + "learning_rate": 1.1402249167429219e-06, + "loss": 0.5848, + "step": 11130 + }, + { + "epoch": 4.108909672655673, + "grad_norm": 0.9817432165145874, + "learning_rate": 1.1398168041381916e-06, + "loss": 0.5393, + "step": 11131 + }, + { + "epoch": 4.109278857986709, + "grad_norm": 0.9390926957130432, + "learning_rate": 1.139408743015965e-06, + "loss": 0.5636, + "step": 11132 + }, + { + "epoch": 4.109648043317746, + "grad_norm": 0.9538723826408386, + "learning_rate": 1.1390007333916892e-06, + "loss": 0.5773, + "step": 11133 + }, + { + "epoch": 4.110017228648782, + "grad_norm": 0.9617483019828796, + "learning_rate": 1.1385927752808058e-06, + "loss": 0.5945, + "step": 11134 + }, + { + "epoch": 4.110386413979818, + "grad_norm": 0.9287585616111755, + "learning_rate": 1.1381848686987553e-06, + "loss": 0.5955, + "step": 11135 + }, + { + "epoch": 4.110755599310854, + "grad_norm": 0.9650591611862183, + "learning_rate": 1.1377770136609789e-06, + "loss": 0.6222, + "step": 11136 + }, + { + "epoch": 4.11112478464189, + "grad_norm": 0.9322013854980469, + "learning_rate": 1.1373692101829125e-06, + "loss": 0.5746, + "step": 11137 + }, + { + "epoch": 4.111493969972926, + "grad_norm": 0.9570606350898743, + "learning_rate": 1.1369614582799914e-06, + "loss": 0.5974, + "step": 11138 + }, + { + "epoch": 4.111863155303962, + "grad_norm": 0.9738538861274719, + "learning_rate": 1.136553757967648e-06, + "loss": 0.5872, + "step": 11139 + }, + { + "epoch": 4.112232340634999, + "grad_norm": 0.9629897475242615, + "learning_rate": 1.1361461092613146e-06, + "loss": 0.5766, + "step": 11140 + }, + { + "epoch": 4.112601525966035, + "grad_norm": 0.931209146976471, + "learning_rate": 1.1357385121764214e-06, + "loss": 0.5902, + "step": 11141 + }, + { + "epoch": 4.112970711297071, + "grad_norm": 0.9478116631507874, + "learning_rate": 1.1353309667283948e-06, + "loss": 0.5591, + "step": 11142 + }, + { + "epoch": 4.113339896628108, + "grad_norm": 0.9563490748405457, + "learning_rate": 1.1349234729326607e-06, + "loss": 0.5951, + "step": 11143 + }, + { + "epoch": 4.1137090819591435, + "grad_norm": 0.9927957653999329, + "learning_rate": 1.1345160308046413e-06, + "loss": 0.5498, + "step": 11144 + }, + { + "epoch": 4.1140782672901794, + "grad_norm": 0.9172760248184204, + "learning_rate": 1.1341086403597602e-06, + "loss": 0.5849, + "step": 11145 + }, + { + "epoch": 4.114447452621216, + "grad_norm": 0.9461301565170288, + "learning_rate": 1.1337013016134357e-06, + "loss": 0.5388, + "step": 11146 + }, + { + "epoch": 4.114816637952252, + "grad_norm": 0.9538074135780334, + "learning_rate": 1.133294014581085e-06, + "loss": 0.5783, + "step": 11147 + }, + { + "epoch": 4.115185823283288, + "grad_norm": 0.9560688138008118, + "learning_rate": 1.1328867792781254e-06, + "loss": 0.5221, + "step": 11148 + }, + { + "epoch": 4.115555008614324, + "grad_norm": 0.9376999735832214, + "learning_rate": 1.1324795957199688e-06, + "loss": 0.5829, + "step": 11149 + }, + { + "epoch": 4.115924193945361, + "grad_norm": 0.9412899613380432, + "learning_rate": 1.1320724639220291e-06, + "loss": 0.5778, + "step": 11150 + }, + { + "epoch": 4.116293379276397, + "grad_norm": 0.9580351710319519, + "learning_rate": 1.1316653838997144e-06, + "loss": 0.5852, + "step": 11151 + }, + { + "epoch": 4.116662564607433, + "grad_norm": 0.9471850395202637, + "learning_rate": 1.1312583556684326e-06, + "loss": 0.5471, + "step": 11152 + }, + { + "epoch": 4.117031749938469, + "grad_norm": 0.96211838722229, + "learning_rate": 1.1308513792435908e-06, + "loss": 0.5508, + "step": 11153 + }, + { + "epoch": 4.117400935269505, + "grad_norm": 0.9478123188018799, + "learning_rate": 1.130444454640592e-06, + "loss": 0.5764, + "step": 11154 + }, + { + "epoch": 4.117770120600541, + "grad_norm": 0.9533302187919617, + "learning_rate": 1.1300375818748377e-06, + "loss": 0.5602, + "step": 11155 + }, + { + "epoch": 4.118139305931578, + "grad_norm": 0.9474069476127625, + "learning_rate": 1.1296307609617294e-06, + "loss": 0.5721, + "step": 11156 + }, + { + "epoch": 4.118508491262614, + "grad_norm": 0.9598758220672607, + "learning_rate": 1.1292239919166642e-06, + "loss": 0.582, + "step": 11157 + }, + { + "epoch": 4.11887767659365, + "grad_norm": 0.9894514679908752, + "learning_rate": 1.1288172747550374e-06, + "loss": 0.5579, + "step": 11158 + }, + { + "epoch": 4.119246861924686, + "grad_norm": 0.9525651931762695, + "learning_rate": 1.128410609492245e-06, + "loss": 0.5785, + "step": 11159 + }, + { + "epoch": 4.119616047255723, + "grad_norm": 0.9541136622428894, + "learning_rate": 1.1280039961436773e-06, + "loss": 0.5801, + "step": 11160 + }, + { + "epoch": 4.1199852325867585, + "grad_norm": 0.974427342414856, + "learning_rate": 1.1275974347247259e-06, + "loss": 0.5684, + "step": 11161 + }, + { + "epoch": 4.120354417917794, + "grad_norm": 0.9597657918930054, + "learning_rate": 1.1271909252507787e-06, + "loss": 0.5907, + "step": 11162 + }, + { + "epoch": 4.120723603248831, + "grad_norm": 0.9753732085227966, + "learning_rate": 1.1267844677372209e-06, + "loss": 0.5415, + "step": 11163 + }, + { + "epoch": 4.121092788579867, + "grad_norm": 0.9399521946907043, + "learning_rate": 1.1263780621994385e-06, + "loss": 0.5882, + "step": 11164 + }, + { + "epoch": 4.121461973910903, + "grad_norm": 0.964413583278656, + "learning_rate": 1.125971708652813e-06, + "loss": 0.5375, + "step": 11165 + }, + { + "epoch": 4.12183115924194, + "grad_norm": 0.9247733354568481, + "learning_rate": 1.1255654071127245e-06, + "loss": 0.5711, + "step": 11166 + }, + { + "epoch": 4.122200344572976, + "grad_norm": 0.9274166822433472, + "learning_rate": 1.1251591575945506e-06, + "loss": 0.5948, + "step": 11167 + }, + { + "epoch": 4.122569529904012, + "grad_norm": 1.0226752758026123, + "learning_rate": 1.1247529601136687e-06, + "loss": 0.5675, + "step": 11168 + }, + { + "epoch": 4.122938715235048, + "grad_norm": 0.9428756833076477, + "learning_rate": 1.1243468146854542e-06, + "loss": 0.5906, + "step": 11169 + }, + { + "epoch": 4.123307900566084, + "grad_norm": 0.9636664986610413, + "learning_rate": 1.1239407213252786e-06, + "loss": 0.5855, + "step": 11170 + }, + { + "epoch": 4.12367708589712, + "grad_norm": 0.930568277835846, + "learning_rate": 1.1235346800485125e-06, + "loss": 0.5446, + "step": 11171 + }, + { + "epoch": 4.124046271228156, + "grad_norm": 0.9478546380996704, + "learning_rate": 1.1231286908705233e-06, + "loss": 0.5786, + "step": 11172 + }, + { + "epoch": 4.124415456559193, + "grad_norm": 0.9690397381782532, + "learning_rate": 1.1227227538066796e-06, + "loss": 0.594, + "step": 11173 + }, + { + "epoch": 4.124784641890229, + "grad_norm": 0.9725150465965271, + "learning_rate": 1.1223168688723449e-06, + "loss": 0.6049, + "step": 11174 + }, + { + "epoch": 4.125153827221265, + "grad_norm": 0.952102780342102, + "learning_rate": 1.121911036082881e-06, + "loss": 0.5828, + "step": 11175 + }, + { + "epoch": 4.125523012552302, + "grad_norm": 0.9371408820152283, + "learning_rate": 1.1215052554536504e-06, + "loss": 0.5851, + "step": 11176 + }, + { + "epoch": 4.125892197883338, + "grad_norm": 0.9958533048629761, + "learning_rate": 1.1210995270000108e-06, + "loss": 0.5704, + "step": 11177 + }, + { + "epoch": 4.1262613832143735, + "grad_norm": 1.0069059133529663, + "learning_rate": 1.120693850737318e-06, + "loss": 0.5883, + "step": 11178 + }, + { + "epoch": 4.126630568545409, + "grad_norm": 0.9853936433792114, + "learning_rate": 1.1202882266809287e-06, + "loss": 0.5282, + "step": 11179 + }, + { + "epoch": 4.126999753876446, + "grad_norm": 0.9687520861625671, + "learning_rate": 1.1198826548461934e-06, + "loss": 0.5767, + "step": 11180 + }, + { + "epoch": 4.127368939207482, + "grad_norm": 0.9859440922737122, + "learning_rate": 1.119477135248465e-06, + "loss": 0.6169, + "step": 11181 + }, + { + "epoch": 4.127738124538518, + "grad_norm": 0.9636742472648621, + "learning_rate": 1.1190716679030915e-06, + "loss": 0.571, + "step": 11182 + }, + { + "epoch": 4.128107309869555, + "grad_norm": 0.9372849464416504, + "learning_rate": 1.1186662528254185e-06, + "loss": 0.6111, + "step": 11183 + }, + { + "epoch": 4.128476495200591, + "grad_norm": 0.9638365507125854, + "learning_rate": 1.1182608900307928e-06, + "loss": 0.5809, + "step": 11184 + }, + { + "epoch": 4.128845680531627, + "grad_norm": 0.9511132836341858, + "learning_rate": 1.1178555795345564e-06, + "loss": 0.5767, + "step": 11185 + }, + { + "epoch": 4.129214865862663, + "grad_norm": 0.9616824984550476, + "learning_rate": 1.1174503213520502e-06, + "loss": 0.5156, + "step": 11186 + }, + { + "epoch": 4.129584051193699, + "grad_norm": 0.8963266611099243, + "learning_rate": 1.117045115498612e-06, + "loss": 0.56, + "step": 11187 + }, + { + "epoch": 4.129953236524735, + "grad_norm": 0.9768795967102051, + "learning_rate": 1.1166399619895795e-06, + "loss": 0.5821, + "step": 11188 + }, + { + "epoch": 4.130322421855771, + "grad_norm": 0.9231671094894409, + "learning_rate": 1.116234860840289e-06, + "loss": 0.5757, + "step": 11189 + }, + { + "epoch": 4.130691607186808, + "grad_norm": 0.9742685556411743, + "learning_rate": 1.115829812066072e-06, + "loss": 0.5637, + "step": 11190 + }, + { + "epoch": 4.131060792517844, + "grad_norm": 0.9430115222930908, + "learning_rate": 1.1154248156822603e-06, + "loss": 0.5749, + "step": 11191 + }, + { + "epoch": 4.13142997784888, + "grad_norm": 0.9463034868240356, + "learning_rate": 1.115019871704181e-06, + "loss": 0.5856, + "step": 11192 + }, + { + "epoch": 4.131799163179917, + "grad_norm": 0.9373690485954285, + "learning_rate": 1.1146149801471633e-06, + "loss": 0.5823, + "step": 11193 + }, + { + "epoch": 4.132168348510953, + "grad_norm": 0.9576979279518127, + "learning_rate": 1.114210141026532e-06, + "loss": 0.5723, + "step": 11194 + }, + { + "epoch": 4.1325375338419885, + "grad_norm": 0.9534048438072205, + "learning_rate": 1.1138053543576078e-06, + "loss": 0.5676, + "step": 11195 + }, + { + "epoch": 4.132906719173024, + "grad_norm": 0.9740688800811768, + "learning_rate": 1.1134006201557138e-06, + "loss": 0.5774, + "step": 11196 + }, + { + "epoch": 4.133275904504061, + "grad_norm": 0.9475070238113403, + "learning_rate": 1.1129959384361706e-06, + "loss": 0.5466, + "step": 11197 + }, + { + "epoch": 4.133645089835097, + "grad_norm": 0.968113899230957, + "learning_rate": 1.1125913092142916e-06, + "loss": 0.5619, + "step": 11198 + }, + { + "epoch": 4.134014275166133, + "grad_norm": 0.9620702862739563, + "learning_rate": 1.1121867325053941e-06, + "loss": 0.5772, + "step": 11199 + }, + { + "epoch": 4.13438346049717, + "grad_norm": 0.9186809659004211, + "learning_rate": 1.1117822083247903e-06, + "loss": 0.5796, + "step": 11200 + }, + { + "epoch": 4.134752645828206, + "grad_norm": 0.9834861159324646, + "learning_rate": 1.1113777366877929e-06, + "loss": 0.5761, + "step": 11201 + }, + { + "epoch": 4.135121831159242, + "grad_norm": 0.9363255500793457, + "learning_rate": 1.1109733176097095e-06, + "loss": 0.5863, + "step": 11202 + }, + { + "epoch": 4.1354910164902785, + "grad_norm": 0.9946532249450684, + "learning_rate": 1.1105689511058469e-06, + "loss": 0.5605, + "step": 11203 + }, + { + "epoch": 4.135860201821314, + "grad_norm": 0.9447387456893921, + "learning_rate": 1.1101646371915117e-06, + "loss": 0.5502, + "step": 11204 + }, + { + "epoch": 4.13622938715235, + "grad_norm": 0.9436483383178711, + "learning_rate": 1.1097603758820066e-06, + "loss": 0.5682, + "step": 11205 + }, + { + "epoch": 4.136598572483386, + "grad_norm": 0.9700493812561035, + "learning_rate": 1.1093561671926314e-06, + "loss": 0.5718, + "step": 11206 + }, + { + "epoch": 4.136967757814423, + "grad_norm": 0.9287259578704834, + "learning_rate": 1.1089520111386876e-06, + "loss": 0.5995, + "step": 11207 + }, + { + "epoch": 4.137336943145459, + "grad_norm": 0.9434582591056824, + "learning_rate": 1.10854790773547e-06, + "loss": 0.5458, + "step": 11208 + }, + { + "epoch": 4.137706128476495, + "grad_norm": 0.9431796669960022, + "learning_rate": 1.1081438569982758e-06, + "loss": 0.542, + "step": 11209 + }, + { + "epoch": 4.138075313807532, + "grad_norm": 0.9364577531814575, + "learning_rate": 1.1077398589423974e-06, + "loss": 0.5778, + "step": 11210 + }, + { + "epoch": 4.138444499138568, + "grad_norm": 0.9415451884269714, + "learning_rate": 1.1073359135831251e-06, + "loss": 0.5535, + "step": 11211 + }, + { + "epoch": 4.1388136844696035, + "grad_norm": 0.9249799251556396, + "learning_rate": 1.1069320209357497e-06, + "loss": 0.5687, + "step": 11212 + }, + { + "epoch": 4.13918286980064, + "grad_norm": 0.9468775391578674, + "learning_rate": 1.1065281810155578e-06, + "loss": 0.5941, + "step": 11213 + }, + { + "epoch": 4.139552055131676, + "grad_norm": 0.9668189883232117, + "learning_rate": 1.1061243938378345e-06, + "loss": 0.561, + "step": 11214 + }, + { + "epoch": 4.139921240462712, + "grad_norm": 0.9315990209579468, + "learning_rate": 1.105720659417862e-06, + "loss": 0.576, + "step": 11215 + }, + { + "epoch": 4.140290425793748, + "grad_norm": 0.9645841121673584, + "learning_rate": 1.1053169777709225e-06, + "loss": 0.588, + "step": 11216 + }, + { + "epoch": 4.140659611124785, + "grad_norm": 0.9578668475151062, + "learning_rate": 1.104913348912297e-06, + "loss": 0.5628, + "step": 11217 + }, + { + "epoch": 4.141028796455821, + "grad_norm": 0.9660695791244507, + "learning_rate": 1.1045097728572592e-06, + "loss": 0.5602, + "step": 11218 + }, + { + "epoch": 4.141397981786857, + "grad_norm": 0.9668000340461731, + "learning_rate": 1.104106249621087e-06, + "loss": 0.5518, + "step": 11219 + }, + { + "epoch": 4.1417671671178935, + "grad_norm": 0.9151424169540405, + "learning_rate": 1.1037027792190515e-06, + "loss": 0.5773, + "step": 11220 + }, + { + "epoch": 4.142136352448929, + "grad_norm": 0.9234116673469543, + "learning_rate": 1.1032993616664265e-06, + "loss": 0.5582, + "step": 11221 + }, + { + "epoch": 4.142505537779965, + "grad_norm": 0.9722840189933777, + "learning_rate": 1.1028959969784797e-06, + "loss": 0.5725, + "step": 11222 + }, + { + "epoch": 4.142874723111002, + "grad_norm": 0.9631638526916504, + "learning_rate": 1.1024926851704773e-06, + "loss": 0.5752, + "step": 11223 + }, + { + "epoch": 4.143243908442038, + "grad_norm": 0.9614027142524719, + "learning_rate": 1.102089426257687e-06, + "loss": 0.5715, + "step": 11224 + }, + { + "epoch": 4.143613093773074, + "grad_norm": 0.9940634965896606, + "learning_rate": 1.1016862202553704e-06, + "loss": 0.5914, + "step": 11225 + }, + { + "epoch": 4.14398227910411, + "grad_norm": 0.9434228539466858, + "learning_rate": 1.1012830671787884e-06, + "loss": 0.525, + "step": 11226 + }, + { + "epoch": 4.144351464435147, + "grad_norm": 0.9514734148979187, + "learning_rate": 1.1008799670432016e-06, + "loss": 0.6044, + "step": 11227 + }, + { + "epoch": 4.144720649766183, + "grad_norm": 0.9649513363838196, + "learning_rate": 1.1004769198638657e-06, + "loss": 0.5831, + "step": 11228 + }, + { + "epoch": 4.1450898350972185, + "grad_norm": 0.9530631899833679, + "learning_rate": 1.1000739256560374e-06, + "loss": 0.5494, + "step": 11229 + }, + { + "epoch": 4.145459020428255, + "grad_norm": 0.9248853921890259, + "learning_rate": 1.0996709844349692e-06, + "loss": 0.5674, + "step": 11230 + }, + { + "epoch": 4.145828205759291, + "grad_norm": 0.9865295886993408, + "learning_rate": 1.0992680962159116e-06, + "loss": 0.5979, + "step": 11231 + }, + { + "epoch": 4.146197391090327, + "grad_norm": 0.9787861704826355, + "learning_rate": 1.0988652610141154e-06, + "loss": 0.5675, + "step": 11232 + }, + { + "epoch": 4.146566576421364, + "grad_norm": 0.9346125721931458, + "learning_rate": 1.0984624788448266e-06, + "loss": 0.5508, + "step": 11233 + }, + { + "epoch": 4.1469357617524, + "grad_norm": 0.9244520664215088, + "learning_rate": 1.09805974972329e-06, + "loss": 0.5892, + "step": 11234 + }, + { + "epoch": 4.147304947083436, + "grad_norm": 0.9553045630455017, + "learning_rate": 1.09765707366475e-06, + "loss": 0.5644, + "step": 11235 + }, + { + "epoch": 4.147674132414472, + "grad_norm": 0.931896448135376, + "learning_rate": 1.0972544506844465e-06, + "loss": 0.5979, + "step": 11236 + }, + { + "epoch": 4.1480433177455085, + "grad_norm": 0.9620617628097534, + "learning_rate": 1.096851880797621e-06, + "loss": 0.5922, + "step": 11237 + }, + { + "epoch": 4.148412503076544, + "grad_norm": 0.9810640811920166, + "learning_rate": 1.0964493640195075e-06, + "loss": 0.5641, + "step": 11238 + }, + { + "epoch": 4.14878168840758, + "grad_norm": 0.9404276609420776, + "learning_rate": 1.0960469003653423e-06, + "loss": 0.5436, + "step": 11239 + }, + { + "epoch": 4.149150873738617, + "grad_norm": 0.9704576134681702, + "learning_rate": 1.09564448985036e-06, + "loss": 0.5964, + "step": 11240 + }, + { + "epoch": 4.149520059069653, + "grad_norm": 0.9774445295333862, + "learning_rate": 1.0952421324897903e-06, + "loss": 0.5756, + "step": 11241 + }, + { + "epoch": 4.149889244400689, + "grad_norm": 0.9566097259521484, + "learning_rate": 1.0948398282988626e-06, + "loss": 0.5538, + "step": 11242 + }, + { + "epoch": 4.150258429731725, + "grad_norm": 0.9660483002662659, + "learning_rate": 1.0944375772928029e-06, + "loss": 0.5581, + "step": 11243 + }, + { + "epoch": 4.150627615062762, + "grad_norm": 1.2582674026489258, + "learning_rate": 1.0940353794868374e-06, + "loss": 0.5638, + "step": 11244 + }, + { + "epoch": 4.150996800393798, + "grad_norm": 0.9391007423400879, + "learning_rate": 1.093633234896191e-06, + "loss": 0.5634, + "step": 11245 + }, + { + "epoch": 4.1513659857248335, + "grad_norm": 0.9489935636520386, + "learning_rate": 1.0932311435360808e-06, + "loss": 0.5622, + "step": 11246 + }, + { + "epoch": 4.15173517105587, + "grad_norm": 0.9860347509384155, + "learning_rate": 1.0928291054217288e-06, + "loss": 0.6072, + "step": 11247 + }, + { + "epoch": 4.152104356386906, + "grad_norm": 0.9433343410491943, + "learning_rate": 1.09242712056835e-06, + "loss": 0.5711, + "step": 11248 + }, + { + "epoch": 4.152473541717942, + "grad_norm": 0.9246366620063782, + "learning_rate": 1.0920251889911615e-06, + "loss": 0.5533, + "step": 11249 + }, + { + "epoch": 4.152842727048979, + "grad_norm": 0.9306853413581848, + "learning_rate": 1.0916233107053753e-06, + "loss": 0.6031, + "step": 11250 + }, + { + "epoch": 4.153211912380015, + "grad_norm": 0.9500686526298523, + "learning_rate": 1.0912214857262013e-06, + "loss": 0.5857, + "step": 11251 + }, + { + "epoch": 4.153581097711051, + "grad_norm": 0.9675424695014954, + "learning_rate": 1.0908197140688504e-06, + "loss": 0.5821, + "step": 11252 + }, + { + "epoch": 4.153950283042088, + "grad_norm": 0.954265832901001, + "learning_rate": 1.0904179957485287e-06, + "loss": 0.5324, + "step": 11253 + }, + { + "epoch": 4.1543194683731235, + "grad_norm": 0.935479998588562, + "learning_rate": 1.0900163307804397e-06, + "loss": 0.5519, + "step": 11254 + }, + { + "epoch": 4.154688653704159, + "grad_norm": 0.9669307470321655, + "learning_rate": 1.089614719179789e-06, + "loss": 0.5654, + "step": 11255 + }, + { + "epoch": 4.155057839035195, + "grad_norm": 0.9220905900001526, + "learning_rate": 1.0892131609617748e-06, + "loss": 0.5762, + "step": 11256 + }, + { + "epoch": 4.155427024366232, + "grad_norm": 0.9273605942726135, + "learning_rate": 1.088811656141599e-06, + "loss": 0.5733, + "step": 11257 + }, + { + "epoch": 4.155796209697268, + "grad_norm": 0.9249597787857056, + "learning_rate": 1.088410204734455e-06, + "loss": 0.5827, + "step": 11258 + }, + { + "epoch": 4.156165395028304, + "grad_norm": 0.9469605684280396, + "learning_rate": 1.0880088067555392e-06, + "loss": 0.6, + "step": 11259 + }, + { + "epoch": 4.156534580359341, + "grad_norm": 0.9270947575569153, + "learning_rate": 1.0876074622200453e-06, + "loss": 0.5629, + "step": 11260 + }, + { + "epoch": 4.156903765690377, + "grad_norm": 0.9235016107559204, + "learning_rate": 1.0872061711431634e-06, + "loss": 0.5644, + "step": 11261 + }, + { + "epoch": 4.157272951021413, + "grad_norm": 0.9530863761901855, + "learning_rate": 1.0868049335400815e-06, + "loss": 0.5832, + "step": 11262 + }, + { + "epoch": 4.1576421363524485, + "grad_norm": 0.9522402286529541, + "learning_rate": 1.0864037494259862e-06, + "loss": 0.603, + "step": 11263 + }, + { + "epoch": 4.158011321683485, + "grad_norm": 0.9799947738647461, + "learning_rate": 1.0860026188160633e-06, + "loss": 0.5804, + "step": 11264 + }, + { + "epoch": 4.158380507014521, + "grad_norm": 0.9533648490905762, + "learning_rate": 1.0856015417254964e-06, + "loss": 0.5653, + "step": 11265 + }, + { + "epoch": 4.158749692345557, + "grad_norm": 0.9859068393707275, + "learning_rate": 1.085200518169463e-06, + "loss": 0.6011, + "step": 11266 + }, + { + "epoch": 4.159118877676594, + "grad_norm": 0.9635876417160034, + "learning_rate": 1.0847995481631434e-06, + "loss": 0.5512, + "step": 11267 + }, + { + "epoch": 4.15948806300763, + "grad_norm": 0.9522500038146973, + "learning_rate": 1.0843986317217154e-06, + "loss": 0.576, + "step": 11268 + }, + { + "epoch": 4.159857248338666, + "grad_norm": 0.9319966435432434, + "learning_rate": 1.083997768860352e-06, + "loss": 0.5975, + "step": 11269 + }, + { + "epoch": 4.160226433669703, + "grad_norm": 1.0011208057403564, + "learning_rate": 1.0835969595942266e-06, + "loss": 0.5944, + "step": 11270 + }, + { + "epoch": 4.1605956190007385, + "grad_norm": 0.9397270083427429, + "learning_rate": 1.083196203938508e-06, + "loss": 0.5597, + "step": 11271 + }, + { + "epoch": 4.160964804331774, + "grad_norm": 0.9930569529533386, + "learning_rate": 1.082795501908366e-06, + "loss": 0.5428, + "step": 11272 + }, + { + "epoch": 4.16133398966281, + "grad_norm": 0.9288663864135742, + "learning_rate": 1.0823948535189691e-06, + "loss": 0.5567, + "step": 11273 + }, + { + "epoch": 4.161703174993847, + "grad_norm": 0.9603767395019531, + "learning_rate": 1.0819942587854777e-06, + "loss": 0.5914, + "step": 11274 + }, + { + "epoch": 4.162072360324883, + "grad_norm": 0.9442951083183289, + "learning_rate": 1.0815937177230567e-06, + "loss": 0.5912, + "step": 11275 + }, + { + "epoch": 4.162441545655919, + "grad_norm": 0.9592859745025635, + "learning_rate": 1.081193230346865e-06, + "loss": 0.5734, + "step": 11276 + }, + { + "epoch": 4.162810730986956, + "grad_norm": 0.9713202118873596, + "learning_rate": 1.0807927966720626e-06, + "loss": 0.5818, + "step": 11277 + }, + { + "epoch": 4.163179916317992, + "grad_norm": 0.9482781887054443, + "learning_rate": 1.0803924167138053e-06, + "loss": 0.554, + "step": 11278 + }, + { + "epoch": 4.163549101649028, + "grad_norm": 0.9335405826568604, + "learning_rate": 1.0799920904872457e-06, + "loss": 0.5497, + "step": 11279 + }, + { + "epoch": 4.163918286980064, + "grad_norm": 0.9374227523803711, + "learning_rate": 1.0795918180075386e-06, + "loss": 0.6149, + "step": 11280 + }, + { + "epoch": 4.1642874723111, + "grad_norm": 0.9652079939842224, + "learning_rate": 1.0791915992898325e-06, + "loss": 0.5404, + "step": 11281 + }, + { + "epoch": 4.164656657642136, + "grad_norm": 0.9654465317726135, + "learning_rate": 1.0787914343492753e-06, + "loss": 0.5948, + "step": 11282 + }, + { + "epoch": 4.165025842973172, + "grad_norm": 0.9743331074714661, + "learning_rate": 1.0783913232010146e-06, + "loss": 0.5829, + "step": 11283 + }, + { + "epoch": 4.165395028304209, + "grad_norm": 0.9182899594306946, + "learning_rate": 1.0779912658601927e-06, + "loss": 0.5458, + "step": 11284 + }, + { + "epoch": 4.165764213635245, + "grad_norm": 0.9287759065628052, + "learning_rate": 1.0775912623419543e-06, + "loss": 0.5607, + "step": 11285 + }, + { + "epoch": 4.166133398966281, + "grad_norm": 0.9550282955169678, + "learning_rate": 1.077191312661436e-06, + "loss": 0.5858, + "step": 11286 + }, + { + "epoch": 4.166502584297318, + "grad_norm": 0.9354690909385681, + "learning_rate": 1.0767914168337775e-06, + "loss": 0.5501, + "step": 11287 + }, + { + "epoch": 4.1668717696283535, + "grad_norm": 0.9695242047309875, + "learning_rate": 1.0763915748741157e-06, + "loss": 0.5997, + "step": 11288 + }, + { + "epoch": 4.167240954959389, + "grad_norm": 0.9486453533172607, + "learning_rate": 1.0759917867975834e-06, + "loss": 0.571, + "step": 11289 + }, + { + "epoch": 4.167610140290426, + "grad_norm": 0.9267899990081787, + "learning_rate": 1.0755920526193125e-06, + "loss": 0.5731, + "step": 11290 + }, + { + "epoch": 4.167979325621462, + "grad_norm": 0.9675056338310242, + "learning_rate": 1.0751923723544322e-06, + "loss": 0.5606, + "step": 11291 + }, + { + "epoch": 4.168348510952498, + "grad_norm": 0.945411741733551, + "learning_rate": 1.0747927460180706e-06, + "loss": 0.5729, + "step": 11292 + }, + { + "epoch": 4.168717696283534, + "grad_norm": 1.0129516124725342, + "learning_rate": 1.0743931736253557e-06, + "loss": 0.5552, + "step": 11293 + }, + { + "epoch": 4.169086881614571, + "grad_norm": 0.9628455638885498, + "learning_rate": 1.0739936551914074e-06, + "loss": 0.6027, + "step": 11294 + }, + { + "epoch": 4.169456066945607, + "grad_norm": 0.9585238695144653, + "learning_rate": 1.0735941907313501e-06, + "loss": 0.5657, + "step": 11295 + }, + { + "epoch": 4.169825252276643, + "grad_norm": 0.9398294687271118, + "learning_rate": 1.0731947802603018e-06, + "loss": 0.5707, + "step": 11296 + }, + { + "epoch": 4.170194437607679, + "grad_norm": 0.9765138626098633, + "learning_rate": 1.0727954237933814e-06, + "loss": 0.5721, + "step": 11297 + }, + { + "epoch": 4.170563622938715, + "grad_norm": 0.9076434969902039, + "learning_rate": 1.0723961213457041e-06, + "loss": 0.5769, + "step": 11298 + }, + { + "epoch": 4.170932808269751, + "grad_norm": 0.9738903045654297, + "learning_rate": 1.0719968729323818e-06, + "loss": 0.5565, + "step": 11299 + }, + { + "epoch": 4.171301993600788, + "grad_norm": 0.9659448266029358, + "learning_rate": 1.0715976785685282e-06, + "loss": 0.5701, + "step": 11300 + }, + { + "epoch": 4.171671178931824, + "grad_norm": 0.9478740692138672, + "learning_rate": 1.0711985382692517e-06, + "loss": 0.5568, + "step": 11301 + }, + { + "epoch": 4.17204036426286, + "grad_norm": 0.9405163526535034, + "learning_rate": 1.0707994520496585e-06, + "loss": 0.5728, + "step": 11302 + }, + { + "epoch": 4.172409549593896, + "grad_norm": 0.9771952033042908, + "learning_rate": 1.070400419924856e-06, + "loss": 0.597, + "step": 11303 + }, + { + "epoch": 4.172778734924933, + "grad_norm": 0.9731000065803528, + "learning_rate": 1.0700014419099456e-06, + "loss": 0.5909, + "step": 11304 + }, + { + "epoch": 4.1731479202559685, + "grad_norm": 0.9337641000747681, + "learning_rate": 1.0696025180200301e-06, + "loss": 0.5763, + "step": 11305 + }, + { + "epoch": 4.173517105587004, + "grad_norm": 1.0115346908569336, + "learning_rate": 1.069203648270208e-06, + "loss": 0.5886, + "step": 11306 + }, + { + "epoch": 4.173886290918041, + "grad_norm": 0.9594628214836121, + "learning_rate": 1.068804832675575e-06, + "loss": 0.5841, + "step": 11307 + }, + { + "epoch": 4.174255476249077, + "grad_norm": 0.9425783753395081, + "learning_rate": 1.0684060712512286e-06, + "loss": 0.5688, + "step": 11308 + }, + { + "epoch": 4.174624661580113, + "grad_norm": 0.9458253979682922, + "learning_rate": 1.0680073640122606e-06, + "loss": 0.5348, + "step": 11309 + }, + { + "epoch": 4.17499384691115, + "grad_norm": 0.9414954781532288, + "learning_rate": 1.067608710973761e-06, + "loss": 0.5715, + "step": 11310 + }, + { + "epoch": 4.175363032242186, + "grad_norm": 0.9433185458183289, + "learning_rate": 1.0672101121508205e-06, + "loss": 0.5888, + "step": 11311 + }, + { + "epoch": 4.175732217573222, + "grad_norm": 0.9551934003829956, + "learning_rate": 1.0668115675585242e-06, + "loss": 0.5635, + "step": 11312 + }, + { + "epoch": 4.176101402904258, + "grad_norm": 0.9397962689399719, + "learning_rate": 1.0664130772119594e-06, + "loss": 0.5712, + "step": 11313 + }, + { + "epoch": 4.176470588235294, + "grad_norm": 0.9536576271057129, + "learning_rate": 1.0660146411262055e-06, + "loss": 0.5433, + "step": 11314 + }, + { + "epoch": 4.17683977356633, + "grad_norm": 0.9314981698989868, + "learning_rate": 1.0656162593163446e-06, + "loss": 0.5875, + "step": 11315 + }, + { + "epoch": 4.177208958897366, + "grad_norm": 0.9382596611976624, + "learning_rate": 1.0652179317974567e-06, + "loss": 0.5528, + "step": 11316 + }, + { + "epoch": 4.177578144228403, + "grad_norm": 0.9160748720169067, + "learning_rate": 1.0648196585846174e-06, + "loss": 0.5706, + "step": 11317 + }, + { + "epoch": 4.177947329559439, + "grad_norm": 0.9623132348060608, + "learning_rate": 1.064421439692901e-06, + "loss": 0.5693, + "step": 11318 + }, + { + "epoch": 4.178316514890475, + "grad_norm": 0.928764283657074, + "learning_rate": 1.0640232751373792e-06, + "loss": 0.5126, + "step": 11319 + }, + { + "epoch": 4.178685700221511, + "grad_norm": 0.9436389803886414, + "learning_rate": 1.063625164933124e-06, + "loss": 0.5917, + "step": 11320 + }, + { + "epoch": 4.179054885552548, + "grad_norm": 1.0091851949691772, + "learning_rate": 1.063227109095203e-06, + "loss": 0.5418, + "step": 11321 + }, + { + "epoch": 4.1794240708835835, + "grad_norm": 0.9447587728500366, + "learning_rate": 1.0628291076386815e-06, + "loss": 0.5763, + "step": 11322 + }, + { + "epoch": 4.179793256214619, + "grad_norm": 0.9538701772689819, + "learning_rate": 1.0624311605786255e-06, + "loss": 0.5749, + "step": 11323 + }, + { + "epoch": 4.180162441545656, + "grad_norm": 0.931213915348053, + "learning_rate": 1.0620332679300958e-06, + "loss": 0.5593, + "step": 11324 + }, + { + "epoch": 4.180531626876692, + "grad_norm": 0.9526000618934631, + "learning_rate": 1.0616354297081539e-06, + "loss": 0.5396, + "step": 11325 + }, + { + "epoch": 4.180900812207728, + "grad_norm": 0.9562174081802368, + "learning_rate": 1.0612376459278568e-06, + "loss": 0.5881, + "step": 11326 + }, + { + "epoch": 4.181269997538765, + "grad_norm": 0.962570071220398, + "learning_rate": 1.06083991660426e-06, + "loss": 0.5881, + "step": 11327 + }, + { + "epoch": 4.181639182869801, + "grad_norm": 0.9083138704299927, + "learning_rate": 1.0604422417524188e-06, + "loss": 0.5499, + "step": 11328 + }, + { + "epoch": 4.182008368200837, + "grad_norm": 0.9345563650131226, + "learning_rate": 1.0600446213873846e-06, + "loss": 0.5548, + "step": 11329 + }, + { + "epoch": 4.182377553531873, + "grad_norm": 0.9163976907730103, + "learning_rate": 1.0596470555242064e-06, + "loss": 0.548, + "step": 11330 + }, + { + "epoch": 4.182746738862909, + "grad_norm": 0.9052497744560242, + "learning_rate": 1.059249544177933e-06, + "loss": 0.5905, + "step": 11331 + }, + { + "epoch": 4.183115924193945, + "grad_norm": 1.0020447969436646, + "learning_rate": 1.058852087363609e-06, + "loss": 0.5724, + "step": 11332 + }, + { + "epoch": 4.183485109524981, + "grad_norm": 0.9364806413650513, + "learning_rate": 1.0584546850962805e-06, + "loss": 0.5703, + "step": 11333 + }, + { + "epoch": 4.183854294856018, + "grad_norm": 0.9468004107475281, + "learning_rate": 1.0580573373909853e-06, + "loss": 0.5522, + "step": 11334 + }, + { + "epoch": 4.184223480187054, + "grad_norm": 0.9194872975349426, + "learning_rate": 1.0576600442627648e-06, + "loss": 0.5501, + "step": 11335 + }, + { + "epoch": 4.18459266551809, + "grad_norm": 0.9401524066925049, + "learning_rate": 1.0572628057266574e-06, + "loss": 0.5758, + "step": 11336 + }, + { + "epoch": 4.184961850849127, + "grad_norm": 0.9345608353614807, + "learning_rate": 1.0568656217976974e-06, + "loss": 0.531, + "step": 11337 + }, + { + "epoch": 4.185331036180163, + "grad_norm": 0.9553866982460022, + "learning_rate": 1.056468492490918e-06, + "loss": 0.5695, + "step": 11338 + }, + { + "epoch": 4.1857002215111985, + "grad_norm": 0.9413250088691711, + "learning_rate": 1.05607141782135e-06, + "loss": 0.5357, + "step": 11339 + }, + { + "epoch": 4.186069406842234, + "grad_norm": 0.9214651584625244, + "learning_rate": 1.0556743978040237e-06, + "loss": 0.5776, + "step": 11340 + }, + { + "epoch": 4.186438592173271, + "grad_norm": 0.918433427810669, + "learning_rate": 1.0552774324539658e-06, + "loss": 0.5706, + "step": 11341 + }, + { + "epoch": 4.186807777504307, + "grad_norm": 0.978976309299469, + "learning_rate": 1.0548805217862001e-06, + "loss": 0.5493, + "step": 11342 + }, + { + "epoch": 4.187176962835343, + "grad_norm": 0.9537103176116943, + "learning_rate": 1.054483665815751e-06, + "loss": 0.5505, + "step": 11343 + }, + { + "epoch": 4.18754614816638, + "grad_norm": 0.9146604537963867, + "learning_rate": 1.0540868645576393e-06, + "loss": 0.5548, + "step": 11344 + }, + { + "epoch": 4.187915333497416, + "grad_norm": 0.9341455698013306, + "learning_rate": 1.0536901180268835e-06, + "loss": 0.5645, + "step": 11345 + }, + { + "epoch": 4.188284518828452, + "grad_norm": 0.9923014044761658, + "learning_rate": 1.0532934262385006e-06, + "loss": 0.5697, + "step": 11346 + }, + { + "epoch": 4.1886537041594885, + "grad_norm": 0.9773287177085876, + "learning_rate": 1.0528967892075038e-06, + "loss": 0.5827, + "step": 11347 + }, + { + "epoch": 4.189022889490524, + "grad_norm": 0.973751425743103, + "learning_rate": 1.0525002069489077e-06, + "loss": 0.5654, + "step": 11348 + }, + { + "epoch": 4.18939207482156, + "grad_norm": 0.9718360304832458, + "learning_rate": 1.052103679477722e-06, + "loss": 0.5773, + "step": 11349 + }, + { + "epoch": 4.189761260152596, + "grad_norm": 0.9544544816017151, + "learning_rate": 1.051707206808954e-06, + "loss": 0.596, + "step": 11350 + }, + { + "epoch": 4.190130445483633, + "grad_norm": 1.0059216022491455, + "learning_rate": 1.0513107889576124e-06, + "loss": 0.5691, + "step": 11351 + }, + { + "epoch": 4.190499630814669, + "grad_norm": 0.9667405486106873, + "learning_rate": 1.050914425938699e-06, + "loss": 0.5512, + "step": 11352 + }, + { + "epoch": 4.190868816145705, + "grad_norm": 0.9653061032295227, + "learning_rate": 1.0505181177672182e-06, + "loss": 0.5685, + "step": 11353 + }, + { + "epoch": 4.191238001476742, + "grad_norm": 1.0109589099884033, + "learning_rate": 1.0501218644581692e-06, + "loss": 0.6025, + "step": 11354 + }, + { + "epoch": 4.1916071868077776, + "grad_norm": 0.9616069197654724, + "learning_rate": 1.0497256660265492e-06, + "loss": 0.5789, + "step": 11355 + }, + { + "epoch": 4.1919763721388135, + "grad_norm": 0.9447638392448425, + "learning_rate": 1.0493295224873559e-06, + "loss": 0.5974, + "step": 11356 + }, + { + "epoch": 4.19234555746985, + "grad_norm": 0.9394352436065674, + "learning_rate": 1.0489334338555823e-06, + "loss": 0.5843, + "step": 11357 + }, + { + "epoch": 4.192714742800886, + "grad_norm": 0.9677183628082275, + "learning_rate": 1.0485374001462194e-06, + "loss": 0.6119, + "step": 11358 + }, + { + "epoch": 4.193083928131922, + "grad_norm": 0.9857876896858215, + "learning_rate": 1.0481414213742587e-06, + "loss": 0.5696, + "step": 11359 + }, + { + "epoch": 4.193453113462958, + "grad_norm": 0.9289847016334534, + "learning_rate": 1.0477454975546869e-06, + "loss": 0.5644, + "step": 11360 + }, + { + "epoch": 4.193822298793995, + "grad_norm": 1.039652943611145, + "learning_rate": 1.0473496287024899e-06, + "loss": 0.5262, + "step": 11361 + }, + { + "epoch": 4.194191484125031, + "grad_norm": 0.9389050006866455, + "learning_rate": 1.04695381483265e-06, + "loss": 0.6074, + "step": 11362 + }, + { + "epoch": 4.194560669456067, + "grad_norm": 0.9640913605690002, + "learning_rate": 1.0465580559601496e-06, + "loss": 0.5645, + "step": 11363 + }, + { + "epoch": 4.1949298547871035, + "grad_norm": 0.9513023495674133, + "learning_rate": 1.046162352099969e-06, + "loss": 0.5852, + "step": 11364 + }, + { + "epoch": 4.195299040118139, + "grad_norm": 0.9572755694389343, + "learning_rate": 1.0457667032670843e-06, + "loss": 0.5467, + "step": 11365 + }, + { + "epoch": 4.195668225449175, + "grad_norm": 0.9763917922973633, + "learning_rate": 1.0453711094764712e-06, + "loss": 0.5897, + "step": 11366 + }, + { + "epoch": 4.196037410780212, + "grad_norm": 0.9466323256492615, + "learning_rate": 1.0449755707431014e-06, + "loss": 0.5698, + "step": 11367 + }, + { + "epoch": 4.196406596111248, + "grad_norm": 0.9653472900390625, + "learning_rate": 1.0445800870819481e-06, + "loss": 0.5677, + "step": 11368 + }, + { + "epoch": 4.196775781442284, + "grad_norm": 0.9684056043624878, + "learning_rate": 1.0441846585079793e-06, + "loss": 0.5618, + "step": 11369 + }, + { + "epoch": 4.19714496677332, + "grad_norm": 0.9474698305130005, + "learning_rate": 1.0437892850361605e-06, + "loss": 0.5673, + "step": 11370 + }, + { + "epoch": 4.197514152104357, + "grad_norm": 0.9379299283027649, + "learning_rate": 1.0433939666814587e-06, + "loss": 0.5359, + "step": 11371 + }, + { + "epoch": 4.1978833374353925, + "grad_norm": 1.2624207735061646, + "learning_rate": 1.0429987034588346e-06, + "loss": 0.5773, + "step": 11372 + }, + { + "epoch": 4.1982525227664285, + "grad_norm": 0.963800311088562, + "learning_rate": 1.042603495383251e-06, + "loss": 0.5345, + "step": 11373 + }, + { + "epoch": 4.198621708097465, + "grad_norm": 0.9545558094978333, + "learning_rate": 1.0422083424696647e-06, + "loss": 0.5757, + "step": 11374 + }, + { + "epoch": 4.198990893428501, + "grad_norm": 0.9655184149742126, + "learning_rate": 1.0418132447330317e-06, + "loss": 0.5747, + "step": 11375 + }, + { + "epoch": 4.199360078759537, + "grad_norm": 0.9788278937339783, + "learning_rate": 1.0414182021883082e-06, + "loss": 0.5944, + "step": 11376 + }, + { + "epoch": 4.199729264090573, + "grad_norm": 0.9794960618019104, + "learning_rate": 1.0410232148504454e-06, + "loss": 0.5724, + "step": 11377 + }, + { + "epoch": 4.20009844942161, + "grad_norm": 0.9682853817939758, + "learning_rate": 1.0406282827343925e-06, + "loss": 0.5565, + "step": 11378 + }, + { + "epoch": 4.200467634752646, + "grad_norm": 0.9497349858283997, + "learning_rate": 1.0402334058550994e-06, + "loss": 0.5646, + "step": 11379 + }, + { + "epoch": 4.200836820083682, + "grad_norm": 0.9593395590782166, + "learning_rate": 1.0398385842275103e-06, + "loss": 0.562, + "step": 11380 + }, + { + "epoch": 4.2012060054147184, + "grad_norm": 0.9763898849487305, + "learning_rate": 1.0394438178665708e-06, + "loss": 0.5964, + "step": 11381 + }, + { + "epoch": 4.201575190745754, + "grad_norm": 0.979760468006134, + "learning_rate": 1.0390491067872222e-06, + "loss": 0.584, + "step": 11382 + }, + { + "epoch": 4.20194437607679, + "grad_norm": 0.9449560046195984, + "learning_rate": 1.0386544510044024e-06, + "loss": 0.5546, + "step": 11383 + }, + { + "epoch": 4.202313561407827, + "grad_norm": 0.9410781264305115, + "learning_rate": 1.0382598505330515e-06, + "loss": 0.5654, + "step": 11384 + }, + { + "epoch": 4.202682746738863, + "grad_norm": 0.9429764747619629, + "learning_rate": 1.037865305388104e-06, + "loss": 0.5563, + "step": 11385 + }, + { + "epoch": 4.203051932069899, + "grad_norm": 0.9861410856246948, + "learning_rate": 1.0374708155844923e-06, + "loss": 0.5708, + "step": 11386 + }, + { + "epoch": 4.203421117400936, + "grad_norm": 0.9435864090919495, + "learning_rate": 1.0370763811371498e-06, + "loss": 0.5912, + "step": 11387 + }, + { + "epoch": 4.203790302731972, + "grad_norm": 0.9479159116744995, + "learning_rate": 1.0366820020610045e-06, + "loss": 0.5929, + "step": 11388 + }, + { + "epoch": 4.2041594880630075, + "grad_norm": 0.9251338839530945, + "learning_rate": 1.0362876783709832e-06, + "loss": 0.5708, + "step": 11389 + }, + { + "epoch": 4.2045286733940435, + "grad_norm": 0.9229128956794739, + "learning_rate": 1.0358934100820107e-06, + "loss": 0.5805, + "step": 11390 + }, + { + "epoch": 4.20489785872508, + "grad_norm": 0.9539967775344849, + "learning_rate": 1.0354991972090103e-06, + "loss": 0.5983, + "step": 11391 + }, + { + "epoch": 4.205267044056116, + "grad_norm": 0.9312884211540222, + "learning_rate": 1.0351050397669043e-06, + "loss": 0.5744, + "step": 11392 + }, + { + "epoch": 4.205636229387152, + "grad_norm": 0.9467967748641968, + "learning_rate": 1.0347109377706102e-06, + "loss": 0.5983, + "step": 11393 + }, + { + "epoch": 4.206005414718189, + "grad_norm": 0.928726315498352, + "learning_rate": 1.0343168912350446e-06, + "loss": 0.5695, + "step": 11394 + }, + { + "epoch": 4.206374600049225, + "grad_norm": 0.9353914856910706, + "learning_rate": 1.0339229001751211e-06, + "loss": 0.5876, + "step": 11395 + }, + { + "epoch": 4.206743785380261, + "grad_norm": 0.9529939293861389, + "learning_rate": 1.0335289646057539e-06, + "loss": 0.5545, + "step": 11396 + }, + { + "epoch": 4.207112970711297, + "grad_norm": 0.9358698725700378, + "learning_rate": 1.0331350845418528e-06, + "loss": 0.5452, + "step": 11397 + }, + { + "epoch": 4.207482156042333, + "grad_norm": 0.9265876412391663, + "learning_rate": 1.032741259998325e-06, + "loss": 0.5619, + "step": 11398 + }, + { + "epoch": 4.207851341373369, + "grad_norm": 0.9203841686248779, + "learning_rate": 1.032347490990078e-06, + "loss": 0.5835, + "step": 11399 + }, + { + "epoch": 4.208220526704405, + "grad_norm": 0.9715249538421631, + "learning_rate": 1.0319537775320146e-06, + "loss": 0.5849, + "step": 11400 + }, + { + "epoch": 4.208589712035442, + "grad_norm": 0.9789806008338928, + "learning_rate": 1.0315601196390381e-06, + "loss": 0.5812, + "step": 11401 + }, + { + "epoch": 4.208958897366478, + "grad_norm": 0.9586224555969238, + "learning_rate": 1.031166517326048e-06, + "loss": 0.5837, + "step": 11402 + }, + { + "epoch": 4.209328082697514, + "grad_norm": 0.9987069964408875, + "learning_rate": 1.0307729706079403e-06, + "loss": 0.5736, + "step": 11403 + }, + { + "epoch": 4.209697268028551, + "grad_norm": 0.9476292133331299, + "learning_rate": 1.030379479499613e-06, + "loss": 0.5726, + "step": 11404 + }, + { + "epoch": 4.210066453359587, + "grad_norm": 0.9566808938980103, + "learning_rate": 1.0299860440159586e-06, + "loss": 0.6144, + "step": 11405 + }, + { + "epoch": 4.2104356386906225, + "grad_norm": 0.9205222129821777, + "learning_rate": 1.0295926641718678e-06, + "loss": 0.5702, + "step": 11406 + }, + { + "epoch": 4.2108048240216585, + "grad_norm": 0.9711706042289734, + "learning_rate": 1.0291993399822313e-06, + "loss": 0.5787, + "step": 11407 + }, + { + "epoch": 4.211174009352695, + "grad_norm": 0.9465502500534058, + "learning_rate": 1.0288060714619359e-06, + "loss": 0.5611, + "step": 11408 + }, + { + "epoch": 4.211543194683731, + "grad_norm": 0.9208208322525024, + "learning_rate": 1.028412858625866e-06, + "loss": 0.5661, + "step": 11409 + }, + { + "epoch": 4.211912380014767, + "grad_norm": 0.9323182702064514, + "learning_rate": 1.028019701488904e-06, + "loss": 0.5586, + "step": 11410 + }, + { + "epoch": 4.212281565345804, + "grad_norm": 0.9055789709091187, + "learning_rate": 1.027626600065932e-06, + "loss": 0.5723, + "step": 11411 + }, + { + "epoch": 4.21265075067684, + "grad_norm": 0.9372318983078003, + "learning_rate": 1.0272335543718293e-06, + "loss": 0.539, + "step": 11412 + }, + { + "epoch": 4.213019936007876, + "grad_norm": 0.9594363570213318, + "learning_rate": 1.0268405644214718e-06, + "loss": 0.5894, + "step": 11413 + }, + { + "epoch": 4.2133891213389125, + "grad_norm": 0.9291462302207947, + "learning_rate": 1.0264476302297338e-06, + "loss": 0.5827, + "step": 11414 + }, + { + "epoch": 4.213758306669948, + "grad_norm": 0.9305471777915955, + "learning_rate": 1.0260547518114875e-06, + "loss": 0.5487, + "step": 11415 + }, + { + "epoch": 4.214127492000984, + "grad_norm": 0.9398887753486633, + "learning_rate": 1.0256619291816041e-06, + "loss": 0.5729, + "step": 11416 + }, + { + "epoch": 4.21449667733202, + "grad_norm": 0.941040575504303, + "learning_rate": 1.0252691623549516e-06, + "loss": 0.6171, + "step": 11417 + }, + { + "epoch": 4.214865862663057, + "grad_norm": 0.9681441783905029, + "learning_rate": 1.024876451346395e-06, + "loss": 0.5682, + "step": 11418 + }, + { + "epoch": 4.215235047994093, + "grad_norm": 0.9636114835739136, + "learning_rate": 1.0244837961707987e-06, + "loss": 0.5787, + "step": 11419 + }, + { + "epoch": 4.215604233325129, + "grad_norm": 0.9739170074462891, + "learning_rate": 1.0240911968430264e-06, + "loss": 0.5662, + "step": 11420 + }, + { + "epoch": 4.215973418656166, + "grad_norm": 0.9531899690628052, + "learning_rate": 1.0236986533779365e-06, + "loss": 0.5971, + "step": 11421 + }, + { + "epoch": 4.216342603987202, + "grad_norm": 0.9362071752548218, + "learning_rate": 1.0233061657903864e-06, + "loss": 0.589, + "step": 11422 + }, + { + "epoch": 4.2167117893182375, + "grad_norm": 0.9710529446601868, + "learning_rate": 1.022913734095231e-06, + "loss": 0.5667, + "step": 11423 + }, + { + "epoch": 4.217080974649274, + "grad_norm": 0.931532621383667, + "learning_rate": 1.0225213583073257e-06, + "loss": 0.5911, + "step": 11424 + }, + { + "epoch": 4.21745015998031, + "grad_norm": 0.9991431832313538, + "learning_rate": 1.0221290384415206e-06, + "loss": 0.5771, + "step": 11425 + }, + { + "epoch": 4.217819345311346, + "grad_norm": 0.9214730858802795, + "learning_rate": 1.021736774512664e-06, + "loss": 0.5749, + "step": 11426 + }, + { + "epoch": 4.218188530642382, + "grad_norm": 0.9527456164360046, + "learning_rate": 1.0213445665356051e-06, + "loss": 0.6075, + "step": 11427 + }, + { + "epoch": 4.218557715973419, + "grad_norm": 0.9679713845252991, + "learning_rate": 1.0209524145251873e-06, + "loss": 0.5748, + "step": 11428 + }, + { + "epoch": 4.218926901304455, + "grad_norm": 0.9504753351211548, + "learning_rate": 1.0205603184962532e-06, + "loss": 0.5859, + "step": 11429 + }, + { + "epoch": 4.219296086635491, + "grad_norm": 0.9627165198326111, + "learning_rate": 1.020168278463645e-06, + "loss": 0.6105, + "step": 11430 + }, + { + "epoch": 4.2196652719665275, + "grad_norm": 0.9656411409378052, + "learning_rate": 1.0197762944421994e-06, + "loss": 0.5847, + "step": 11431 + }, + { + "epoch": 4.220034457297563, + "grad_norm": 0.9645059704780579, + "learning_rate": 1.0193843664467548e-06, + "loss": 0.5854, + "step": 11432 + }, + { + "epoch": 4.220403642628599, + "grad_norm": 0.9599614143371582, + "learning_rate": 1.0189924944921448e-06, + "loss": 0.5193, + "step": 11433 + }, + { + "epoch": 4.220772827959636, + "grad_norm": 0.9545050263404846, + "learning_rate": 1.0186006785932005e-06, + "loss": 0.5799, + "step": 11434 + }, + { + "epoch": 4.221142013290672, + "grad_norm": 0.9544112086296082, + "learning_rate": 1.0182089187647537e-06, + "loss": 0.5223, + "step": 11435 + }, + { + "epoch": 4.221511198621708, + "grad_norm": 0.999262273311615, + "learning_rate": 1.0178172150216316e-06, + "loss": 0.5811, + "step": 11436 + }, + { + "epoch": 4.221880383952744, + "grad_norm": 0.9436172842979431, + "learning_rate": 1.0174255673786605e-06, + "loss": 0.5564, + "step": 11437 + }, + { + "epoch": 4.222249569283781, + "grad_norm": 0.9541794061660767, + "learning_rate": 1.0170339758506626e-06, + "loss": 0.5541, + "step": 11438 + }, + { + "epoch": 4.222618754614817, + "grad_norm": 0.9422008395195007, + "learning_rate": 1.0166424404524608e-06, + "loss": 0.5907, + "step": 11439 + }, + { + "epoch": 4.2229879399458525, + "grad_norm": 0.9571720361709595, + "learning_rate": 1.0162509611988753e-06, + "loss": 0.5878, + "step": 11440 + }, + { + "epoch": 4.223357125276889, + "grad_norm": 0.9516973495483398, + "learning_rate": 1.0158595381047226e-06, + "loss": 0.5676, + "step": 11441 + }, + { + "epoch": 4.223726310607925, + "grad_norm": 0.9500491619110107, + "learning_rate": 1.0154681711848183e-06, + "loss": 0.552, + "step": 11442 + }, + { + "epoch": 4.224095495938961, + "grad_norm": 0.921733021736145, + "learning_rate": 1.015076860453974e-06, + "loss": 0.5838, + "step": 11443 + }, + { + "epoch": 4.224464681269998, + "grad_norm": 0.9559779167175293, + "learning_rate": 1.0146856059270028e-06, + "loss": 0.5845, + "step": 11444 + }, + { + "epoch": 4.224833866601034, + "grad_norm": 0.9329501986503601, + "learning_rate": 1.0142944076187126e-06, + "loss": 0.5569, + "step": 11445 + }, + { + "epoch": 4.22520305193207, + "grad_norm": 0.9686665534973145, + "learning_rate": 1.0139032655439093e-06, + "loss": 0.5684, + "step": 11446 + }, + { + "epoch": 4.225572237263106, + "grad_norm": 0.9408827424049377, + "learning_rate": 1.0135121797173994e-06, + "loss": 0.5575, + "step": 11447 + }, + { + "epoch": 4.2259414225941425, + "grad_norm": 0.9487376809120178, + "learning_rate": 1.0131211501539842e-06, + "loss": 0.5846, + "step": 11448 + }, + { + "epoch": 4.226310607925178, + "grad_norm": 0.9788563251495361, + "learning_rate": 1.0127301768684636e-06, + "loss": 0.5704, + "step": 11449 + }, + { + "epoch": 4.226679793256214, + "grad_norm": 0.9549158215522766, + "learning_rate": 1.012339259875637e-06, + "loss": 0.578, + "step": 11450 + }, + { + "epoch": 4.227048978587251, + "grad_norm": 0.9330679178237915, + "learning_rate": 1.0119483991902992e-06, + "loss": 0.564, + "step": 11451 + }, + { + "epoch": 4.227418163918287, + "grad_norm": 0.9389236569404602, + "learning_rate": 1.0115575948272455e-06, + "loss": 0.5697, + "step": 11452 + }, + { + "epoch": 4.227787349249323, + "grad_norm": 0.9666774272918701, + "learning_rate": 1.0111668468012673e-06, + "loss": 0.5635, + "step": 11453 + }, + { + "epoch": 4.228156534580359, + "grad_norm": 0.9739336371421814, + "learning_rate": 1.0107761551271531e-06, + "loss": 0.5832, + "step": 11454 + }, + { + "epoch": 4.228525719911396, + "grad_norm": 0.9602746367454529, + "learning_rate": 1.0103855198196924e-06, + "loss": 0.6234, + "step": 11455 + }, + { + "epoch": 4.228894905242432, + "grad_norm": 0.982236921787262, + "learning_rate": 1.0099949408936697e-06, + "loss": 0.585, + "step": 11456 + }, + { + "epoch": 4.2292640905734675, + "grad_norm": 0.9737415909767151, + "learning_rate": 1.0096044183638671e-06, + "loss": 0.5538, + "step": 11457 + }, + { + "epoch": 4.229633275904504, + "grad_norm": 0.9331293106079102, + "learning_rate": 1.0092139522450678e-06, + "loss": 0.535, + "step": 11458 + }, + { + "epoch": 4.23000246123554, + "grad_norm": 0.9548097252845764, + "learning_rate": 1.008823542552049e-06, + "loss": 0.568, + "step": 11459 + }, + { + "epoch": 4.230371646566576, + "grad_norm": 0.9609190225601196, + "learning_rate": 1.0084331892995896e-06, + "loss": 0.5398, + "step": 11460 + }, + { + "epoch": 4.230740831897613, + "grad_norm": 0.9329116940498352, + "learning_rate": 1.0080428925024632e-06, + "loss": 0.5681, + "step": 11461 + }, + { + "epoch": 4.231110017228649, + "grad_norm": 0.9531049728393555, + "learning_rate": 1.0076526521754416e-06, + "loss": 0.584, + "step": 11462 + }, + { + "epoch": 4.231479202559685, + "grad_norm": 0.9621787667274475, + "learning_rate": 1.0072624683332968e-06, + "loss": 0.5881, + "step": 11463 + }, + { + "epoch": 4.231848387890722, + "grad_norm": 0.9699482321739197, + "learning_rate": 1.0068723409907965e-06, + "loss": 0.5766, + "step": 11464 + }, + { + "epoch": 4.2322175732217575, + "grad_norm": 0.9443783164024353, + "learning_rate": 1.0064822701627068e-06, + "loss": 0.5633, + "step": 11465 + }, + { + "epoch": 4.232586758552793, + "grad_norm": 0.9755722880363464, + "learning_rate": 1.0060922558637912e-06, + "loss": 0.5727, + "step": 11466 + }, + { + "epoch": 4.232955943883829, + "grad_norm": 0.9351140856742859, + "learning_rate": 1.0057022981088122e-06, + "loss": 0.5641, + "step": 11467 + }, + { + "epoch": 4.233325129214866, + "grad_norm": 0.962568461894989, + "learning_rate": 1.005312396912531e-06, + "loss": 0.5716, + "step": 11468 + }, + { + "epoch": 4.233694314545902, + "grad_norm": 0.9487777948379517, + "learning_rate": 1.0049225522897024e-06, + "loss": 0.5589, + "step": 11469 + }, + { + "epoch": 4.234063499876938, + "grad_norm": 0.9954864382743835, + "learning_rate": 1.0045327642550843e-06, + "loss": 0.5932, + "step": 11470 + }, + { + "epoch": 4.234432685207975, + "grad_norm": 0.955604612827301, + "learning_rate": 1.0041430328234281e-06, + "loss": 0.576, + "step": 11471 + }, + { + "epoch": 4.234801870539011, + "grad_norm": 0.9625194072723389, + "learning_rate": 1.0037533580094866e-06, + "loss": 0.5839, + "step": 11472 + }, + { + "epoch": 4.235171055870047, + "grad_norm": 0.9749653339385986, + "learning_rate": 1.0033637398280087e-06, + "loss": 0.5901, + "step": 11473 + }, + { + "epoch": 4.2355402412010825, + "grad_norm": 0.9582996964454651, + "learning_rate": 1.00297417829374e-06, + "loss": 0.5629, + "step": 11474 + }, + { + "epoch": 4.235909426532119, + "grad_norm": 0.9867292642593384, + "learning_rate": 1.0025846734214268e-06, + "loss": 0.5823, + "step": 11475 + }, + { + "epoch": 4.236278611863155, + "grad_norm": 0.9204971790313721, + "learning_rate": 1.0021952252258112e-06, + "loss": 0.583, + "step": 11476 + }, + { + "epoch": 4.236647797194191, + "grad_norm": 1.0204139947891235, + "learning_rate": 1.0018058337216327e-06, + "loss": 0.5721, + "step": 11477 + }, + { + "epoch": 4.237016982525228, + "grad_norm": 0.9505346417427063, + "learning_rate": 1.0014164989236316e-06, + "loss": 0.581, + "step": 11478 + }, + { + "epoch": 4.237386167856264, + "grad_norm": 0.9637829661369324, + "learning_rate": 1.0010272208465422e-06, + "loss": 0.5849, + "step": 11479 + }, + { + "epoch": 4.2377553531873, + "grad_norm": 0.9499833583831787, + "learning_rate": 1.0006379995051e-06, + "loss": 0.5995, + "step": 11480 + }, + { + "epoch": 4.238124538518337, + "grad_norm": 0.9502218961715698, + "learning_rate": 1.0002488349140365e-06, + "loss": 0.5769, + "step": 11481 + }, + { + "epoch": 4.2384937238493725, + "grad_norm": 0.9402387738227844, + "learning_rate": 9.998597270880803e-07, + "loss": 0.5491, + "step": 11482 + }, + { + "epoch": 4.238862909180408, + "grad_norm": 0.929897129535675, + "learning_rate": 9.99470676041961e-07, + "loss": 0.5801, + "step": 11483 + }, + { + "epoch": 4.239232094511444, + "grad_norm": 0.9539269208908081, + "learning_rate": 9.990816817904031e-07, + "loss": 0.5999, + "step": 11484 + }, + { + "epoch": 4.239601279842481, + "grad_norm": 0.9704883694648743, + "learning_rate": 9.986927443481295e-07, + "loss": 0.5542, + "step": 11485 + }, + { + "epoch": 4.239970465173517, + "grad_norm": 0.9165163636207581, + "learning_rate": 9.983038637298612e-07, + "loss": 0.5608, + "step": 11486 + }, + { + "epoch": 4.240339650504553, + "grad_norm": 0.9423311352729797, + "learning_rate": 9.979150399503176e-07, + "loss": 0.5839, + "step": 11487 + }, + { + "epoch": 4.24070883583559, + "grad_norm": 0.9676038026809692, + "learning_rate": 9.975262730242174e-07, + "loss": 0.5328, + "step": 11488 + }, + { + "epoch": 4.241078021166626, + "grad_norm": 0.9158011674880981, + "learning_rate": 9.971375629662717e-07, + "loss": 0.5764, + "step": 11489 + }, + { + "epoch": 4.241447206497662, + "grad_norm": 0.9488062262535095, + "learning_rate": 9.96748909791195e-07, + "loss": 0.6027, + "step": 11490 + }, + { + "epoch": 4.241816391828698, + "grad_norm": 0.9927535653114319, + "learning_rate": 9.963603135136986e-07, + "loss": 0.5888, + "step": 11491 + }, + { + "epoch": 4.242185577159734, + "grad_norm": 0.9357973337173462, + "learning_rate": 9.959717741484897e-07, + "loss": 0.5644, + "step": 11492 + }, + { + "epoch": 4.24255476249077, + "grad_norm": 0.9334299564361572, + "learning_rate": 9.955832917102745e-07, + "loss": 0.5542, + "step": 11493 + }, + { + "epoch": 4.242923947821806, + "grad_norm": 0.9483194947242737, + "learning_rate": 9.951948662137562e-07, + "loss": 0.5265, + "step": 11494 + }, + { + "epoch": 4.243293133152843, + "grad_norm": 0.9238734245300293, + "learning_rate": 9.948064976736372e-07, + "loss": 0.5569, + "step": 11495 + }, + { + "epoch": 4.243662318483879, + "grad_norm": 0.9300233125686646, + "learning_rate": 9.944181861046188e-07, + "loss": 0.5906, + "step": 11496 + }, + { + "epoch": 4.244031503814915, + "grad_norm": 0.9309622049331665, + "learning_rate": 9.940299315213953e-07, + "loss": 0.5591, + "step": 11497 + }, + { + "epoch": 4.244400689145952, + "grad_norm": 0.9777302742004395, + "learning_rate": 9.936417339386641e-07, + "loss": 0.5919, + "step": 11498 + }, + { + "epoch": 4.2447698744769875, + "grad_norm": 0.9429308176040649, + "learning_rate": 9.932535933711173e-07, + "loss": 0.5551, + "step": 11499 + }, + { + "epoch": 4.245139059808023, + "grad_norm": 0.9317396283149719, + "learning_rate": 9.928655098334476e-07, + "loss": 0.5623, + "step": 11500 + }, + { + "epoch": 4.24550824513906, + "grad_norm": 0.9487853050231934, + "learning_rate": 9.924774833403423e-07, + "loss": 0.5842, + "step": 11501 + }, + { + "epoch": 4.245877430470096, + "grad_norm": 0.9477739334106445, + "learning_rate": 9.920895139064876e-07, + "loss": 0.5952, + "step": 11502 + }, + { + "epoch": 4.246246615801132, + "grad_norm": 0.957338809967041, + "learning_rate": 9.9170160154657e-07, + "loss": 0.5765, + "step": 11503 + }, + { + "epoch": 4.246615801132168, + "grad_norm": 0.934786856174469, + "learning_rate": 9.913137462752707e-07, + "loss": 0.5684, + "step": 11504 + }, + { + "epoch": 4.246984986463205, + "grad_norm": 0.97487872838974, + "learning_rate": 9.90925948107269e-07, + "loss": 0.5882, + "step": 11505 + }, + { + "epoch": 4.247354171794241, + "grad_norm": 0.9533730149269104, + "learning_rate": 9.905382070572447e-07, + "loss": 0.5644, + "step": 11506 + }, + { + "epoch": 4.247723357125277, + "grad_norm": 0.9738190770149231, + "learning_rate": 9.901505231398719e-07, + "loss": 0.6152, + "step": 11507 + }, + { + "epoch": 4.248092542456313, + "grad_norm": 0.9488312602043152, + "learning_rate": 9.897628963698274e-07, + "loss": 0.6149, + "step": 11508 + }, + { + "epoch": 4.248461727787349, + "grad_norm": 0.9317591786384583, + "learning_rate": 9.893753267617787e-07, + "loss": 0.5447, + "step": 11509 + }, + { + "epoch": 4.248830913118385, + "grad_norm": 0.9441948533058167, + "learning_rate": 9.88987814330397e-07, + "loss": 0.5587, + "step": 11510 + }, + { + "epoch": 4.249200098449422, + "grad_norm": 0.9397814869880676, + "learning_rate": 9.886003590903505e-07, + "loss": 0.5786, + "step": 11511 + }, + { + "epoch": 4.249569283780458, + "grad_norm": 0.9588726758956909, + "learning_rate": 9.882129610563038e-07, + "loss": 0.5944, + "step": 11512 + }, + { + "epoch": 4.249938469111494, + "grad_norm": 0.9550454616546631, + "learning_rate": 9.87825620242919e-07, + "loss": 0.5948, + "step": 11513 + }, + { + "epoch": 4.25030765444253, + "grad_norm": 0.9453098177909851, + "learning_rate": 9.874383366648565e-07, + "loss": 0.5993, + "step": 11514 + }, + { + "epoch": 4.250676839773567, + "grad_norm": 0.9576504826545715, + "learning_rate": 9.870511103367756e-07, + "loss": 0.5825, + "step": 11515 + }, + { + "epoch": 4.2510460251046025, + "grad_norm": 0.9433522820472717, + "learning_rate": 9.866639412733345e-07, + "loss": 0.5738, + "step": 11516 + }, + { + "epoch": 4.251415210435638, + "grad_norm": 0.9949995279312134, + "learning_rate": 9.862768294891837e-07, + "loss": 0.5593, + "step": 11517 + }, + { + "epoch": 4.251784395766675, + "grad_norm": 0.9374171495437622, + "learning_rate": 9.858897749989779e-07, + "loss": 0.5809, + "step": 11518 + }, + { + "epoch": 4.252153581097711, + "grad_norm": 0.9597029685974121, + "learning_rate": 9.855027778173656e-07, + "loss": 0.5614, + "step": 11519 + }, + { + "epoch": 4.252522766428747, + "grad_norm": 0.956619918346405, + "learning_rate": 9.851158379589962e-07, + "loss": 0.5653, + "step": 11520 + }, + { + "epoch": 4.252891951759784, + "grad_norm": 0.9421029090881348, + "learning_rate": 9.84728955438514e-07, + "loss": 0.5464, + "step": 11521 + }, + { + "epoch": 4.25326113709082, + "grad_norm": 0.961851179599762, + "learning_rate": 9.84342130270562e-07, + "loss": 0.5374, + "step": 11522 + }, + { + "epoch": 4.253630322421856, + "grad_norm": 0.9352585673332214, + "learning_rate": 9.839553624697826e-07, + "loss": 0.5695, + "step": 11523 + }, + { + "epoch": 4.253999507752892, + "grad_norm": 0.9426154494285583, + "learning_rate": 9.835686520508141e-07, + "loss": 0.604, + "step": 11524 + }, + { + "epoch": 4.254368693083928, + "grad_norm": 0.9935364723205566, + "learning_rate": 9.831819990282932e-07, + "loss": 0.5762, + "step": 11525 + }, + { + "epoch": 4.254737878414964, + "grad_norm": 0.9792397618293762, + "learning_rate": 9.827954034168555e-07, + "loss": 0.581, + "step": 11526 + }, + { + "epoch": 4.255107063746, + "grad_norm": 0.9855045080184937, + "learning_rate": 9.82408865231132e-07, + "loss": 0.5951, + "step": 11527 + }, + { + "epoch": 4.255476249077037, + "grad_norm": 0.9320459365844727, + "learning_rate": 9.82022384485755e-07, + "loss": 0.5769, + "step": 11528 + }, + { + "epoch": 4.255845434408073, + "grad_norm": 0.9569157361984253, + "learning_rate": 9.816359611953514e-07, + "loss": 0.5293, + "step": 11529 + }, + { + "epoch": 4.256214619739109, + "grad_norm": 0.9588131904602051, + "learning_rate": 9.812495953745468e-07, + "loss": 0.5794, + "step": 11530 + }, + { + "epoch": 4.256583805070145, + "grad_norm": 0.9530490040779114, + "learning_rate": 9.808632870379667e-07, + "loss": 0.5484, + "step": 11531 + }, + { + "epoch": 4.256952990401182, + "grad_norm": 0.9307998418807983, + "learning_rate": 9.804770362002313e-07, + "loss": 0.5931, + "step": 11532 + }, + { + "epoch": 4.2573221757322175, + "grad_norm": 0.9676576852798462, + "learning_rate": 9.800908428759601e-07, + "loss": 0.5921, + "step": 11533 + }, + { + "epoch": 4.257691361063253, + "grad_norm": 0.9627931714057922, + "learning_rate": 9.797047070797716e-07, + "loss": 0.5871, + "step": 11534 + }, + { + "epoch": 4.25806054639429, + "grad_norm": 0.9333997964859009, + "learning_rate": 9.79318628826279e-07, + "loss": 0.5376, + "step": 11535 + }, + { + "epoch": 4.258429731725326, + "grad_norm": 0.9370325207710266, + "learning_rate": 9.789326081300985e-07, + "loss": 0.5609, + "step": 11536 + }, + { + "epoch": 4.258798917056362, + "grad_norm": 0.9214361310005188, + "learning_rate": 9.785466450058368e-07, + "loss": 0.5612, + "step": 11537 + }, + { + "epoch": 4.259168102387399, + "grad_norm": 0.960133969783783, + "learning_rate": 9.781607394681043e-07, + "loss": 0.5602, + "step": 11538 + }, + { + "epoch": 4.259537287718435, + "grad_norm": 0.9286371469497681, + "learning_rate": 9.777748915315084e-07, + "loss": 0.5345, + "step": 11539 + }, + { + "epoch": 4.259906473049471, + "grad_norm": 0.9330176711082458, + "learning_rate": 9.773891012106523e-07, + "loss": 0.5587, + "step": 11540 + }, + { + "epoch": 4.2602756583805075, + "grad_norm": 0.9845418930053711, + "learning_rate": 9.770033685201386e-07, + "loss": 0.5729, + "step": 11541 + }, + { + "epoch": 4.260644843711543, + "grad_norm": 0.9881420731544495, + "learning_rate": 9.766176934745655e-07, + "loss": 0.5768, + "step": 11542 + }, + { + "epoch": 4.261014029042579, + "grad_norm": 0.9222227334976196, + "learning_rate": 9.762320760885318e-07, + "loss": 0.5483, + "step": 11543 + }, + { + "epoch": 4.261383214373615, + "grad_norm": 0.9766771793365479, + "learning_rate": 9.758465163766353e-07, + "loss": 0.5849, + "step": 11544 + }, + { + "epoch": 4.261752399704652, + "grad_norm": 0.9419397711753845, + "learning_rate": 9.754610143534652e-07, + "loss": 0.631, + "step": 11545 + }, + { + "epoch": 4.262121585035688, + "grad_norm": 0.9599580764770508, + "learning_rate": 9.750755700336154e-07, + "loss": 0.6017, + "step": 11546 + }, + { + "epoch": 4.262490770366724, + "grad_norm": 0.9572346210479736, + "learning_rate": 9.74690183431673e-07, + "loss": 0.5435, + "step": 11547 + }, + { + "epoch": 4.262859955697761, + "grad_norm": 0.9299012422561646, + "learning_rate": 9.74304854562227e-07, + "loss": 0.6192, + "step": 11548 + }, + { + "epoch": 4.263229141028797, + "grad_norm": 0.9549832940101624, + "learning_rate": 9.739195834398605e-07, + "loss": 0.5721, + "step": 11549 + }, + { + "epoch": 4.2635983263598325, + "grad_norm": 0.9514803886413574, + "learning_rate": 9.73534370079155e-07, + "loss": 0.5653, + "step": 11550 + }, + { + "epoch": 4.263967511690868, + "grad_norm": 0.967563271522522, + "learning_rate": 9.73149214494693e-07, + "loss": 0.6077, + "step": 11551 + }, + { + "epoch": 4.264336697021905, + "grad_norm": 0.9485242366790771, + "learning_rate": 9.727641167010514e-07, + "loss": 0.5786, + "step": 11552 + }, + { + "epoch": 4.264705882352941, + "grad_norm": 0.992895781993866, + "learning_rate": 9.72379076712805e-07, + "loss": 0.5662, + "step": 11553 + }, + { + "epoch": 4.265075067683977, + "grad_norm": 0.9601638317108154, + "learning_rate": 9.719940945445295e-07, + "loss": 0.5531, + "step": 11554 + }, + { + "epoch": 4.265444253015014, + "grad_norm": 0.9329087734222412, + "learning_rate": 9.71609170210794e-07, + "loss": 0.5971, + "step": 11555 + }, + { + "epoch": 4.26581343834605, + "grad_norm": 0.9431681036949158, + "learning_rate": 9.71224303726171e-07, + "loss": 0.5604, + "step": 11556 + }, + { + "epoch": 4.266182623677086, + "grad_norm": 0.9304361939430237, + "learning_rate": 9.708394951052238e-07, + "loss": 0.6156, + "step": 11557 + }, + { + "epoch": 4.2665518090081225, + "grad_norm": 0.947626531124115, + "learning_rate": 9.704547443625193e-07, + "loss": 0.5597, + "step": 11558 + }, + { + "epoch": 4.266920994339158, + "grad_norm": 0.967276394367218, + "learning_rate": 9.700700515126207e-07, + "loss": 0.5995, + "step": 11559 + }, + { + "epoch": 4.267290179670194, + "grad_norm": 0.9421349167823792, + "learning_rate": 9.696854165700877e-07, + "loss": 0.5922, + "step": 11560 + }, + { + "epoch": 4.26765936500123, + "grad_norm": 0.9531494379043579, + "learning_rate": 9.693008395494791e-07, + "loss": 0.6039, + "step": 11561 + }, + { + "epoch": 4.268028550332267, + "grad_norm": 0.9600772857666016, + "learning_rate": 9.689163204653494e-07, + "loss": 0.5696, + "step": 11562 + }, + { + "epoch": 4.268397735663303, + "grad_norm": 0.9505480527877808, + "learning_rate": 9.685318593322537e-07, + "loss": 0.589, + "step": 11563 + }, + { + "epoch": 4.268766920994339, + "grad_norm": 0.9849680662155151, + "learning_rate": 9.681474561647459e-07, + "loss": 0.5643, + "step": 11564 + }, + { + "epoch": 4.269136106325376, + "grad_norm": 0.9423608779907227, + "learning_rate": 9.677631109773713e-07, + "loss": 0.5827, + "step": 11565 + }, + { + "epoch": 4.269505291656412, + "grad_norm": 0.9534035325050354, + "learning_rate": 9.673788237846797e-07, + "loss": 0.606, + "step": 11566 + }, + { + "epoch": 4.2698744769874475, + "grad_norm": 0.9839419722557068, + "learning_rate": 9.669945946012167e-07, + "loss": 0.5569, + "step": 11567 + }, + { + "epoch": 4.270243662318484, + "grad_norm": 0.9485743045806885, + "learning_rate": 9.666104234415242e-07, + "loss": 0.5828, + "step": 11568 + }, + { + "epoch": 4.27061284764952, + "grad_norm": 0.9213816523551941, + "learning_rate": 9.662263103201434e-07, + "loss": 0.5625, + "step": 11569 + }, + { + "epoch": 4.270982032980556, + "grad_norm": 0.953281044960022, + "learning_rate": 9.65842255251612e-07, + "loss": 0.5836, + "step": 11570 + }, + { + "epoch": 4.271351218311592, + "grad_norm": 0.999312162399292, + "learning_rate": 9.654582582504675e-07, + "loss": 0.5695, + "step": 11571 + }, + { + "epoch": 4.271720403642629, + "grad_norm": 0.9677693843841553, + "learning_rate": 9.650743193312439e-07, + "loss": 0.557, + "step": 11572 + }, + { + "epoch": 4.272089588973665, + "grad_norm": 0.9450191259384155, + "learning_rate": 9.64690438508472e-07, + "loss": 0.5201, + "step": 11573 + }, + { + "epoch": 4.272458774304701, + "grad_norm": 0.9316368699073792, + "learning_rate": 9.643066157966832e-07, + "loss": 0.5811, + "step": 11574 + }, + { + "epoch": 4.2728279596357375, + "grad_norm": 0.9498394727706909, + "learning_rate": 9.639228512104038e-07, + "loss": 0.5911, + "step": 11575 + }, + { + "epoch": 4.273197144966773, + "grad_norm": 0.9766371846199036, + "learning_rate": 9.635391447641602e-07, + "loss": 0.5388, + "step": 11576 + }, + { + "epoch": 4.273566330297809, + "grad_norm": 0.9250172972679138, + "learning_rate": 9.63155496472475e-07, + "loss": 0.587, + "step": 11577 + }, + { + "epoch": 4.273935515628846, + "grad_norm": 0.9706646203994751, + "learning_rate": 9.627719063498685e-07, + "loss": 0.5915, + "step": 11578 + }, + { + "epoch": 4.274304700959882, + "grad_norm": 0.9472401142120361, + "learning_rate": 9.623883744108614e-07, + "loss": 0.5684, + "step": 11579 + }, + { + "epoch": 4.274673886290918, + "grad_norm": 0.9405657649040222, + "learning_rate": 9.620049006699686e-07, + "loss": 0.5307, + "step": 11580 + }, + { + "epoch": 4.275043071621954, + "grad_norm": 0.90965336561203, + "learning_rate": 9.61621485141704e-07, + "loss": 0.5739, + "step": 11581 + }, + { + "epoch": 4.275412256952991, + "grad_norm": 0.9719299077987671, + "learning_rate": 9.612381278405819e-07, + "loss": 0.5913, + "step": 11582 + }, + { + "epoch": 4.275781442284027, + "grad_norm": 0.958766758441925, + "learning_rate": 9.6085482878111e-07, + "loss": 0.575, + "step": 11583 + }, + { + "epoch": 4.2761506276150625, + "grad_norm": 0.9583850502967834, + "learning_rate": 9.604715879777986e-07, + "loss": 0.5491, + "step": 11584 + }, + { + "epoch": 4.276519812946099, + "grad_norm": 0.9560595750808716, + "learning_rate": 9.600884054451504e-07, + "loss": 0.5848, + "step": 11585 + }, + { + "epoch": 4.276888998277135, + "grad_norm": 0.9443165063858032, + "learning_rate": 9.597052811976698e-07, + "loss": 0.5926, + "step": 11586 + }, + { + "epoch": 4.277258183608171, + "grad_norm": 0.9713648557662964, + "learning_rate": 9.593222152498587e-07, + "loss": 0.6027, + "step": 11587 + }, + { + "epoch": 4.277627368939207, + "grad_norm": 0.9273136258125305, + "learning_rate": 9.58939207616216e-07, + "loss": 0.5683, + "step": 11588 + }, + { + "epoch": 4.277996554270244, + "grad_norm": 0.9335790872573853, + "learning_rate": 9.58556258311238e-07, + "loss": 0.5909, + "step": 11589 + }, + { + "epoch": 4.27836573960128, + "grad_norm": 0.985508143901825, + "learning_rate": 9.581733673494179e-07, + "loss": 0.5995, + "step": 11590 + }, + { + "epoch": 4.278734924932316, + "grad_norm": 0.9519031047821045, + "learning_rate": 9.577905347452494e-07, + "loss": 0.5752, + "step": 11591 + }, + { + "epoch": 4.2791041102633525, + "grad_norm": 0.9839320182800293, + "learning_rate": 9.574077605132242e-07, + "loss": 0.6172, + "step": 11592 + }, + { + "epoch": 4.279473295594388, + "grad_norm": 0.9980006217956543, + "learning_rate": 9.570250446678267e-07, + "loss": 0.5527, + "step": 11593 + }, + { + "epoch": 4.279842480925424, + "grad_norm": 0.9411236047744751, + "learning_rate": 9.566423872235451e-07, + "loss": 0.6009, + "step": 11594 + }, + { + "epoch": 4.280211666256461, + "grad_norm": 0.9384942054748535, + "learning_rate": 9.56259788194861e-07, + "loss": 0.5408, + "step": 11595 + }, + { + "epoch": 4.280580851587497, + "grad_norm": 0.9239780306816101, + "learning_rate": 9.558772475962578e-07, + "loss": 0.54, + "step": 11596 + }, + { + "epoch": 4.280950036918533, + "grad_norm": 0.9453583359718323, + "learning_rate": 9.554947654422134e-07, + "loss": 0.5598, + "step": 11597 + }, + { + "epoch": 4.28131922224957, + "grad_norm": 0.9206327199935913, + "learning_rate": 9.55112341747204e-07, + "loss": 0.6069, + "step": 11598 + }, + { + "epoch": 4.281688407580606, + "grad_norm": 0.9627348780632019, + "learning_rate": 9.547299765257053e-07, + "loss": 0.6517, + "step": 11599 + }, + { + "epoch": 4.282057592911642, + "grad_norm": 0.9433503746986389, + "learning_rate": 9.543476697921896e-07, + "loss": 0.5522, + "step": 11600 + }, + { + "epoch": 4.2824267782426775, + "grad_norm": 0.9448719024658203, + "learning_rate": 9.539654215611258e-07, + "loss": 0.6062, + "step": 11601 + }, + { + "epoch": 4.282795963573714, + "grad_norm": 0.9868447780609131, + "learning_rate": 9.535832318469837e-07, + "loss": 0.5677, + "step": 11602 + }, + { + "epoch": 4.28316514890475, + "grad_norm": 0.9266169667243958, + "learning_rate": 9.532011006642275e-07, + "loss": 0.5632, + "step": 11603 + }, + { + "epoch": 4.283534334235786, + "grad_norm": 0.9586525559425354, + "learning_rate": 9.528190280273222e-07, + "loss": 0.6022, + "step": 11604 + }, + { + "epoch": 4.283903519566823, + "grad_norm": 0.9323959946632385, + "learning_rate": 9.524370139507286e-07, + "loss": 0.568, + "step": 11605 + }, + { + "epoch": 4.284272704897859, + "grad_norm": 0.9366006255149841, + "learning_rate": 9.520550584489047e-07, + "loss": 0.5627, + "step": 11606 + }, + { + "epoch": 4.284641890228895, + "grad_norm": 0.9244582653045654, + "learning_rate": 9.516731615363089e-07, + "loss": 0.5706, + "step": 11607 + }, + { + "epoch": 4.285011075559931, + "grad_norm": 0.9461366534233093, + "learning_rate": 9.512913232273954e-07, + "loss": 0.554, + "step": 11608 + }, + { + "epoch": 4.2853802608909675, + "grad_norm": 0.9419654011726379, + "learning_rate": 9.509095435366159e-07, + "loss": 0.5784, + "step": 11609 + }, + { + "epoch": 4.285749446222003, + "grad_norm": 0.988690972328186, + "learning_rate": 9.505278224784221e-07, + "loss": 0.618, + "step": 11610 + }, + { + "epoch": 4.286118631553039, + "grad_norm": 0.9642881751060486, + "learning_rate": 9.501461600672599e-07, + "loss": 0.5522, + "step": 11611 + }, + { + "epoch": 4.286487816884076, + "grad_norm": 0.9291445016860962, + "learning_rate": 9.497645563175787e-07, + "loss": 0.5759, + "step": 11612 + }, + { + "epoch": 4.286857002215112, + "grad_norm": 0.9429658651351929, + "learning_rate": 9.493830112438177e-07, + "loss": 0.5911, + "step": 11613 + }, + { + "epoch": 4.287226187546148, + "grad_norm": 0.9571182727813721, + "learning_rate": 9.490015248604204e-07, + "loss": 0.559, + "step": 11614 + }, + { + "epoch": 4.287595372877185, + "grad_norm": 0.9699932932853699, + "learning_rate": 9.486200971818266e-07, + "loss": 0.5693, + "step": 11615 + }, + { + "epoch": 4.287964558208221, + "grad_norm": 0.9233858585357666, + "learning_rate": 9.482387282224729e-07, + "loss": 0.5735, + "step": 11616 + }, + { + "epoch": 4.288333743539257, + "grad_norm": 0.931090235710144, + "learning_rate": 9.478574179967931e-07, + "loss": 0.5272, + "step": 11617 + }, + { + "epoch": 4.2887029288702925, + "grad_norm": 1.0113381147384644, + "learning_rate": 9.474761665192195e-07, + "loss": 0.5813, + "step": 11618 + }, + { + "epoch": 4.289072114201329, + "grad_norm": 0.9617070555686951, + "learning_rate": 9.470949738041841e-07, + "loss": 0.6117, + "step": 11619 + }, + { + "epoch": 4.289441299532365, + "grad_norm": 0.96503084897995, + "learning_rate": 9.467138398661135e-07, + "loss": 0.5686, + "step": 11620 + }, + { + "epoch": 4.289810484863401, + "grad_norm": 0.9391685128211975, + "learning_rate": 9.463327647194331e-07, + "loss": 0.5521, + "step": 11621 + }, + { + "epoch": 4.290179670194438, + "grad_norm": 0.9423322081565857, + "learning_rate": 9.459517483785683e-07, + "loss": 0.5941, + "step": 11622 + }, + { + "epoch": 4.290548855525474, + "grad_norm": 0.9177348613739014, + "learning_rate": 9.455707908579381e-07, + "loss": 0.5917, + "step": 11623 + }, + { + "epoch": 4.29091804085651, + "grad_norm": 0.9365488290786743, + "learning_rate": 9.45189892171964e-07, + "loss": 0.5841, + "step": 11624 + }, + { + "epoch": 4.2912872261875465, + "grad_norm": 0.983862578868866, + "learning_rate": 9.448090523350617e-07, + "loss": 0.5819, + "step": 11625 + }, + { + "epoch": 4.2916564115185825, + "grad_norm": 0.9400773048400879, + "learning_rate": 9.444282713616451e-07, + "loss": 0.6054, + "step": 11626 + }, + { + "epoch": 4.292025596849618, + "grad_norm": 0.9595383405685425, + "learning_rate": 9.440475492661285e-07, + "loss": 0.5508, + "step": 11627 + }, + { + "epoch": 4.292394782180654, + "grad_norm": 0.934643030166626, + "learning_rate": 9.436668860629208e-07, + "loss": 0.5478, + "step": 11628 + }, + { + "epoch": 4.292763967511691, + "grad_norm": 0.9168581366539001, + "learning_rate": 9.432862817664299e-07, + "loss": 0.5554, + "step": 11629 + }, + { + "epoch": 4.293133152842727, + "grad_norm": 0.9542291760444641, + "learning_rate": 9.429057363910624e-07, + "loss": 0.5733, + "step": 11630 + }, + { + "epoch": 4.293502338173763, + "grad_norm": 0.947027862071991, + "learning_rate": 9.425252499512205e-07, + "loss": 0.5705, + "step": 11631 + }, + { + "epoch": 4.2938715235048, + "grad_norm": 0.9499085545539856, + "learning_rate": 9.42144822461308e-07, + "loss": 0.5678, + "step": 11632 + }, + { + "epoch": 4.294240708835836, + "grad_norm": 0.9399240016937256, + "learning_rate": 9.417644539357207e-07, + "loss": 0.5636, + "step": 11633 + }, + { + "epoch": 4.2946098941668716, + "grad_norm": 0.9563084840774536, + "learning_rate": 9.413841443888571e-07, + "loss": 0.5913, + "step": 11634 + }, + { + "epoch": 4.294979079497908, + "grad_norm": 0.9646972417831421, + "learning_rate": 9.410038938351123e-07, + "loss": 0.5859, + "step": 11635 + }, + { + "epoch": 4.295348264828944, + "grad_norm": 0.9660083055496216, + "learning_rate": 9.406237022888784e-07, + "loss": 0.5624, + "step": 11636 + }, + { + "epoch": 4.29571745015998, + "grad_norm": 0.9258942008018494, + "learning_rate": 9.402435697645443e-07, + "loss": 0.5469, + "step": 11637 + }, + { + "epoch": 4.296086635491016, + "grad_norm": 0.9499889612197876, + "learning_rate": 9.398634962764996e-07, + "loss": 0.5632, + "step": 11638 + }, + { + "epoch": 4.296455820822053, + "grad_norm": 0.9713356494903564, + "learning_rate": 9.394834818391294e-07, + "loss": 0.5529, + "step": 11639 + }, + { + "epoch": 4.296825006153089, + "grad_norm": 0.9376131892204285, + "learning_rate": 9.391035264668166e-07, + "loss": 0.5588, + "step": 11640 + }, + { + "epoch": 4.297194191484125, + "grad_norm": 0.912166178226471, + "learning_rate": 9.387236301739419e-07, + "loss": 0.5638, + "step": 11641 + }, + { + "epoch": 4.2975633768151615, + "grad_norm": 0.979503333568573, + "learning_rate": 9.383437929748851e-07, + "loss": 0.581, + "step": 11642 + }, + { + "epoch": 4.2979325621461975, + "grad_norm": 0.9831344485282898, + "learning_rate": 9.379640148840238e-07, + "loss": 0.5528, + "step": 11643 + }, + { + "epoch": 4.298301747477233, + "grad_norm": 0.9622329473495483, + "learning_rate": 9.375842959157317e-07, + "loss": 0.5871, + "step": 11644 + }, + { + "epoch": 4.298670932808269, + "grad_norm": 0.9995614290237427, + "learning_rate": 9.372046360843807e-07, + "loss": 0.5926, + "step": 11645 + }, + { + "epoch": 4.299040118139306, + "grad_norm": 0.9551502466201782, + "learning_rate": 9.368250354043398e-07, + "loss": 0.5706, + "step": 11646 + }, + { + "epoch": 4.299409303470342, + "grad_norm": 0.9517110586166382, + "learning_rate": 9.364454938899792e-07, + "loss": 0.5341, + "step": 11647 + }, + { + "epoch": 4.299778488801378, + "grad_norm": 0.9420505166053772, + "learning_rate": 9.360660115556633e-07, + "loss": 0.5931, + "step": 11648 + }, + { + "epoch": 4.300147674132415, + "grad_norm": 0.9698629379272461, + "learning_rate": 9.356865884157543e-07, + "loss": 0.5755, + "step": 11649 + }, + { + "epoch": 4.300516859463451, + "grad_norm": 0.9747273921966553, + "learning_rate": 9.353072244846151e-07, + "loss": 0.5632, + "step": 11650 + }, + { + "epoch": 4.3008860447944866, + "grad_norm": 0.9674713015556335, + "learning_rate": 9.349279197766026e-07, + "loss": 0.59, + "step": 11651 + }, + { + "epoch": 4.301255230125523, + "grad_norm": 0.9313544631004333, + "learning_rate": 9.345486743060752e-07, + "loss": 0.5785, + "step": 11652 + }, + { + "epoch": 4.301624415456559, + "grad_norm": 0.9344936013221741, + "learning_rate": 9.341694880873866e-07, + "loss": 0.5505, + "step": 11653 + }, + { + "epoch": 4.301993600787595, + "grad_norm": 0.9618654251098633, + "learning_rate": 9.337903611348878e-07, + "loss": 0.544, + "step": 11654 + }, + { + "epoch": 4.302362786118632, + "grad_norm": 0.9344484210014343, + "learning_rate": 9.334112934629305e-07, + "loss": 0.6106, + "step": 11655 + }, + { + "epoch": 4.302731971449668, + "grad_norm": 0.9783422350883484, + "learning_rate": 9.330322850858611e-07, + "loss": 0.5524, + "step": 11656 + }, + { + "epoch": 4.303101156780704, + "grad_norm": 0.95818692445755, + "learning_rate": 9.326533360180246e-07, + "loss": 0.5556, + "step": 11657 + }, + { + "epoch": 4.30347034211174, + "grad_norm": 0.9439405202865601, + "learning_rate": 9.322744462737654e-07, + "loss": 0.5792, + "step": 11658 + }, + { + "epoch": 4.3038395274427765, + "grad_norm": 0.9300701022148132, + "learning_rate": 9.318956158674236e-07, + "loss": 0.5259, + "step": 11659 + }, + { + "epoch": 4.3042087127738125, + "grad_norm": 0.8873725533485413, + "learning_rate": 9.315168448133377e-07, + "loss": 0.4932, + "step": 11660 + }, + { + "epoch": 4.304577898104848, + "grad_norm": 0.9237948060035706, + "learning_rate": 9.311381331258435e-07, + "loss": 0.5799, + "step": 11661 + }, + { + "epoch": 4.304947083435885, + "grad_norm": 0.9642597436904907, + "learning_rate": 9.307594808192757e-07, + "loss": 0.575, + "step": 11662 + }, + { + "epoch": 4.305316268766921, + "grad_norm": 0.933371365070343, + "learning_rate": 9.303808879079674e-07, + "loss": 0.5384, + "step": 11663 + }, + { + "epoch": 4.305685454097957, + "grad_norm": 0.9347201585769653, + "learning_rate": 9.30002354406247e-07, + "loss": 0.5672, + "step": 11664 + }, + { + "epoch": 4.306054639428993, + "grad_norm": 0.9294368624687195, + "learning_rate": 9.296238803284419e-07, + "loss": 0.5819, + "step": 11665 + }, + { + "epoch": 4.30642382476003, + "grad_norm": 0.9697225689888, + "learning_rate": 9.292454656888763e-07, + "loss": 0.5576, + "step": 11666 + }, + { + "epoch": 4.306793010091066, + "grad_norm": 0.9670032858848572, + "learning_rate": 9.288671105018751e-07, + "loss": 0.5704, + "step": 11667 + }, + { + "epoch": 4.3071621954221015, + "grad_norm": 0.9192411303520203, + "learning_rate": 9.284888147817581e-07, + "loss": 0.5794, + "step": 11668 + }, + { + "epoch": 4.307531380753138, + "grad_norm": 0.951052188873291, + "learning_rate": 9.281105785428423e-07, + "loss": 0.5835, + "step": 11669 + }, + { + "epoch": 4.307900566084174, + "grad_norm": 0.9537923336029053, + "learning_rate": 9.277324017994463e-07, + "loss": 0.5625, + "step": 11670 + }, + { + "epoch": 4.30826975141521, + "grad_norm": 0.927956223487854, + "learning_rate": 9.273542845658814e-07, + "loss": 0.5583, + "step": 11671 + }, + { + "epoch": 4.308638936746247, + "grad_norm": 0.9209533333778381, + "learning_rate": 9.269762268564616e-07, + "loss": 0.5722, + "step": 11672 + }, + { + "epoch": 4.309008122077283, + "grad_norm": 0.9634197354316711, + "learning_rate": 9.26598228685495e-07, + "loss": 0.5933, + "step": 11673 + }, + { + "epoch": 4.309377307408319, + "grad_norm": 0.9448256492614746, + "learning_rate": 9.262202900672881e-07, + "loss": 0.5758, + "step": 11674 + }, + { + "epoch": 4.309746492739356, + "grad_norm": 0.9850823283195496, + "learning_rate": 9.258424110161474e-07, + "loss": 0.5707, + "step": 11675 + }, + { + "epoch": 4.3101156780703915, + "grad_norm": 0.9574640393257141, + "learning_rate": 9.254645915463744e-07, + "loss": 0.5683, + "step": 11676 + }, + { + "epoch": 4.3104848634014274, + "grad_norm": 0.9677302241325378, + "learning_rate": 9.250868316722691e-07, + "loss": 0.563, + "step": 11677 + }, + { + "epoch": 4.310854048732463, + "grad_norm": 0.9233808517456055, + "learning_rate": 9.247091314081311e-07, + "loss": 0.5845, + "step": 11678 + }, + { + "epoch": 4.3112232340635, + "grad_norm": 0.940577507019043, + "learning_rate": 9.243314907682551e-07, + "loss": 0.5634, + "step": 11679 + }, + { + "epoch": 4.311592419394536, + "grad_norm": 0.9542648792266846, + "learning_rate": 9.23953909766934e-07, + "loss": 0.5681, + "step": 11680 + }, + { + "epoch": 4.311961604725572, + "grad_norm": 0.9475418329238892, + "learning_rate": 9.235763884184609e-07, + "loss": 0.5759, + "step": 11681 + }, + { + "epoch": 4.312330790056609, + "grad_norm": 0.9954706430435181, + "learning_rate": 9.231989267371233e-07, + "loss": 0.5731, + "step": 11682 + }, + { + "epoch": 4.312699975387645, + "grad_norm": 0.9339395761489868, + "learning_rate": 9.228215247372093e-07, + "loss": 0.5514, + "step": 11683 + }, + { + "epoch": 4.313069160718681, + "grad_norm": 0.9515482783317566, + "learning_rate": 9.22444182433003e-07, + "loss": 0.5537, + "step": 11684 + }, + { + "epoch": 4.3134383460497165, + "grad_norm": 0.9294950366020203, + "learning_rate": 9.220668998387855e-07, + "loss": 0.5675, + "step": 11685 + }, + { + "epoch": 4.313807531380753, + "grad_norm": 0.97634357213974, + "learning_rate": 9.216896769688389e-07, + "loss": 0.5642, + "step": 11686 + }, + { + "epoch": 4.314176716711789, + "grad_norm": 1.006697416305542, + "learning_rate": 9.213125138374396e-07, + "loss": 0.5967, + "step": 11687 + }, + { + "epoch": 4.314545902042825, + "grad_norm": 0.9393929839134216, + "learning_rate": 9.209354104588635e-07, + "loss": 0.5552, + "step": 11688 + }, + { + "epoch": 4.314915087373862, + "grad_norm": 0.9478394389152527, + "learning_rate": 9.205583668473831e-07, + "loss": 0.593, + "step": 11689 + }, + { + "epoch": 4.315284272704898, + "grad_norm": 0.9661380052566528, + "learning_rate": 9.201813830172698e-07, + "loss": 0.5671, + "step": 11690 + }, + { + "epoch": 4.315653458035934, + "grad_norm": 0.9681259393692017, + "learning_rate": 9.198044589827937e-07, + "loss": 0.5841, + "step": 11691 + }, + { + "epoch": 4.316022643366971, + "grad_norm": 0.9442054033279419, + "learning_rate": 9.194275947582198e-07, + "loss": 0.5338, + "step": 11692 + }, + { + "epoch": 4.3163918286980065, + "grad_norm": 0.9242862462997437, + "learning_rate": 9.190507903578128e-07, + "loss": 0.5462, + "step": 11693 + }, + { + "epoch": 4.316761014029042, + "grad_norm": 1.209479808807373, + "learning_rate": 9.186740457958335e-07, + "loss": 0.5497, + "step": 11694 + }, + { + "epoch": 4.317130199360078, + "grad_norm": 0.9620179533958435, + "learning_rate": 9.182973610865434e-07, + "loss": 0.5677, + "step": 11695 + }, + { + "epoch": 4.317499384691115, + "grad_norm": 0.9882169365882874, + "learning_rate": 9.179207362441988e-07, + "loss": 0.5678, + "step": 11696 + }, + { + "epoch": 4.317868570022151, + "grad_norm": 0.9699048399925232, + "learning_rate": 9.175441712830546e-07, + "loss": 0.5654, + "step": 11697 + }, + { + "epoch": 4.318237755353187, + "grad_norm": 0.9386368989944458, + "learning_rate": 9.171676662173645e-07, + "loss": 0.564, + "step": 11698 + }, + { + "epoch": 4.318606940684224, + "grad_norm": 0.9236804842948914, + "learning_rate": 9.167912210613788e-07, + "loss": 0.5626, + "step": 11699 + }, + { + "epoch": 4.31897612601526, + "grad_norm": 0.9383452534675598, + "learning_rate": 9.16414835829345e-07, + "loss": 0.5738, + "step": 11700 + }, + { + "epoch": 4.319345311346296, + "grad_norm": 0.961529552936554, + "learning_rate": 9.160385105355108e-07, + "loss": 0.6076, + "step": 11701 + }, + { + "epoch": 4.319714496677332, + "grad_norm": 0.974827766418457, + "learning_rate": 9.15662245194118e-07, + "loss": 0.5611, + "step": 11702 + }, + { + "epoch": 4.320083682008368, + "grad_norm": 0.9736395478248596, + "learning_rate": 9.1528603981941e-07, + "loss": 0.5665, + "step": 11703 + }, + { + "epoch": 4.320452867339404, + "grad_norm": 0.9138890504837036, + "learning_rate": 9.149098944256255e-07, + "loss": 0.5614, + "step": 11704 + }, + { + "epoch": 4.32082205267044, + "grad_norm": 0.942080557346344, + "learning_rate": 9.14533809027e-07, + "loss": 0.6107, + "step": 11705 + }, + { + "epoch": 4.321191238001477, + "grad_norm": 1.0771946907043457, + "learning_rate": 9.141577836377704e-07, + "loss": 0.5618, + "step": 11706 + }, + { + "epoch": 4.321560423332513, + "grad_norm": 0.9549753665924072, + "learning_rate": 9.137818182721686e-07, + "loss": 0.5759, + "step": 11707 + }, + { + "epoch": 4.321929608663549, + "grad_norm": 0.9164910912513733, + "learning_rate": 9.134059129444239e-07, + "loss": 0.5626, + "step": 11708 + }, + { + "epoch": 4.322298793994586, + "grad_norm": 0.931551456451416, + "learning_rate": 9.13030067668764e-07, + "loss": 0.554, + "step": 11709 + }, + { + "epoch": 4.3226679793256215, + "grad_norm": 0.9808580279350281, + "learning_rate": 9.126542824594153e-07, + "loss": 0.5916, + "step": 11710 + }, + { + "epoch": 4.323037164656657, + "grad_norm": 0.9850585460662842, + "learning_rate": 9.122785573306017e-07, + "loss": 0.6033, + "step": 11711 + }, + { + "epoch": 4.323406349987694, + "grad_norm": 0.9594637155532837, + "learning_rate": 9.119028922965437e-07, + "loss": 0.5696, + "step": 11712 + }, + { + "epoch": 4.32377553531873, + "grad_norm": 0.9671863317489624, + "learning_rate": 9.11527287371459e-07, + "loss": 0.5743, + "step": 11713 + }, + { + "epoch": 4.324144720649766, + "grad_norm": 0.9407777786254883, + "learning_rate": 9.111517425695663e-07, + "loss": 0.5997, + "step": 11714 + }, + { + "epoch": 4.324513905980802, + "grad_norm": 0.9497171640396118, + "learning_rate": 9.107762579050786e-07, + "loss": 0.5765, + "step": 11715 + }, + { + "epoch": 4.324883091311839, + "grad_norm": 0.9601430892944336, + "learning_rate": 9.104008333922077e-07, + "loss": 0.5891, + "step": 11716 + }, + { + "epoch": 4.325252276642875, + "grad_norm": 0.9477629661560059, + "learning_rate": 9.100254690451629e-07, + "loss": 0.6038, + "step": 11717 + }, + { + "epoch": 4.325621461973911, + "grad_norm": 0.962644636631012, + "learning_rate": 9.096501648781522e-07, + "loss": 0.5747, + "step": 11718 + }, + { + "epoch": 4.325990647304947, + "grad_norm": 1.1011154651641846, + "learning_rate": 9.092749209053828e-07, + "loss": 0.5226, + "step": 11719 + }, + { + "epoch": 4.326359832635983, + "grad_norm": 0.9107478857040405, + "learning_rate": 9.088997371410537e-07, + "loss": 0.5712, + "step": 11720 + }, + { + "epoch": 4.326729017967019, + "grad_norm": 1.0019042491912842, + "learning_rate": 9.085246135993683e-07, + "loss": 0.624, + "step": 11721 + }, + { + "epoch": 4.327098203298055, + "grad_norm": 0.9451860189437866, + "learning_rate": 9.081495502945231e-07, + "loss": 0.5491, + "step": 11722 + }, + { + "epoch": 4.327467388629092, + "grad_norm": 0.9771292209625244, + "learning_rate": 9.07774547240716e-07, + "loss": 0.5949, + "step": 11723 + }, + { + "epoch": 4.327836573960128, + "grad_norm": 0.9653292894363403, + "learning_rate": 9.073996044521397e-07, + "loss": 0.5943, + "step": 11724 + }, + { + "epoch": 4.328205759291164, + "grad_norm": 0.9477372169494629, + "learning_rate": 9.070247219429848e-07, + "loss": 0.559, + "step": 11725 + }, + { + "epoch": 4.328574944622201, + "grad_norm": 0.9299965500831604, + "learning_rate": 9.066498997274423e-07, + "loss": 0.5325, + "step": 11726 + }, + { + "epoch": 4.3289441299532365, + "grad_norm": 0.9575318694114685, + "learning_rate": 9.062751378196985e-07, + "loss": 0.5733, + "step": 11727 + }, + { + "epoch": 4.329313315284272, + "grad_norm": 0.946064829826355, + "learning_rate": 9.059004362339366e-07, + "loss": 0.5879, + "step": 11728 + }, + { + "epoch": 4.329682500615309, + "grad_norm": 0.944401741027832, + "learning_rate": 9.05525794984341e-07, + "loss": 0.5835, + "step": 11729 + }, + { + "epoch": 4.330051685946345, + "grad_norm": 0.988174557685852, + "learning_rate": 9.051512140850898e-07, + "loss": 0.5649, + "step": 11730 + }, + { + "epoch": 4.330420871277381, + "grad_norm": 0.9570655822753906, + "learning_rate": 9.04776693550363e-07, + "loss": 0.557, + "step": 11731 + }, + { + "epoch": 4.330790056608418, + "grad_norm": 0.9296292066574097, + "learning_rate": 9.044022333943347e-07, + "loss": 0.5604, + "step": 11732 + }, + { + "epoch": 4.331159241939454, + "grad_norm": 0.9687544107437134, + "learning_rate": 9.040278336311775e-07, + "loss": 0.5605, + "step": 11733 + }, + { + "epoch": 4.33152842727049, + "grad_norm": 0.9758220314979553, + "learning_rate": 9.036534942750638e-07, + "loss": 0.6023, + "step": 11734 + }, + { + "epoch": 4.331897612601526, + "grad_norm": 0.9654886722564697, + "learning_rate": 9.032792153401617e-07, + "loss": 0.5825, + "step": 11735 + }, + { + "epoch": 4.332266797932562, + "grad_norm": 0.9513998627662659, + "learning_rate": 9.029049968406373e-07, + "loss": 0.5539, + "step": 11736 + }, + { + "epoch": 4.332635983263598, + "grad_norm": 0.9771547913551331, + "learning_rate": 9.02530838790654e-07, + "loss": 0.5442, + "step": 11737 + }, + { + "epoch": 4.333005168594634, + "grad_norm": 0.9748669862747192, + "learning_rate": 9.021567412043741e-07, + "loss": 0.5799, + "step": 11738 + }, + { + "epoch": 4.333374353925671, + "grad_norm": 0.9513847827911377, + "learning_rate": 9.01782704095959e-07, + "loss": 0.5779, + "step": 11739 + }, + { + "epoch": 4.333743539256707, + "grad_norm": 0.9777699112892151, + "learning_rate": 9.014087274795624e-07, + "loss": 0.5805, + "step": 11740 + }, + { + "epoch": 4.334112724587743, + "grad_norm": 0.954906165599823, + "learning_rate": 9.01034811369342e-07, + "loss": 0.5704, + "step": 11741 + }, + { + "epoch": 4.334481909918779, + "grad_norm": 0.9538619518280029, + "learning_rate": 9.006609557794482e-07, + "loss": 0.5854, + "step": 11742 + }, + { + "epoch": 4.334851095249816, + "grad_norm": 0.9784435033798218, + "learning_rate": 9.002871607240338e-07, + "loss": 0.5511, + "step": 11743 + }, + { + "epoch": 4.3352202805808515, + "grad_norm": 0.9371801018714905, + "learning_rate": 8.999134262172452e-07, + "loss": 0.5607, + "step": 11744 + }, + { + "epoch": 4.335589465911887, + "grad_norm": 0.9603213667869568, + "learning_rate": 8.995397522732274e-07, + "loss": 0.6023, + "step": 11745 + }, + { + "epoch": 4.335958651242924, + "grad_norm": 0.9691757559776306, + "learning_rate": 8.991661389061257e-07, + "loss": 0.581, + "step": 11746 + }, + { + "epoch": 4.33632783657396, + "grad_norm": 0.9567104578018188, + "learning_rate": 8.987925861300806e-07, + "loss": 0.5736, + "step": 11747 + }, + { + "epoch": 4.336697021904996, + "grad_norm": 0.9795947074890137, + "learning_rate": 8.984190939592299e-07, + "loss": 0.5994, + "step": 11748 + }, + { + "epoch": 4.337066207236033, + "grad_norm": 0.9451456069946289, + "learning_rate": 8.980456624077119e-07, + "loss": 0.5503, + "step": 11749 + }, + { + "epoch": 4.337435392567069, + "grad_norm": 0.9532627463340759, + "learning_rate": 8.97672291489659e-07, + "loss": 0.5659, + "step": 11750 + }, + { + "epoch": 4.337804577898105, + "grad_norm": 0.9584570527076721, + "learning_rate": 8.972989812192051e-07, + "loss": 0.587, + "step": 11751 + }, + { + "epoch": 4.3381737632291415, + "grad_norm": 0.9458928108215332, + "learning_rate": 8.96925731610479e-07, + "loss": 0.5323, + "step": 11752 + }, + { + "epoch": 4.338542948560177, + "grad_norm": 0.9202786684036255, + "learning_rate": 8.965525426776072e-07, + "loss": 0.5792, + "step": 11753 + }, + { + "epoch": 4.338912133891213, + "grad_norm": 0.9540140628814697, + "learning_rate": 8.961794144347163e-07, + "loss": 0.5788, + "step": 11754 + }, + { + "epoch": 4.339281319222249, + "grad_norm": 0.9474434852600098, + "learning_rate": 8.958063468959286e-07, + "loss": 0.5672, + "step": 11755 + }, + { + "epoch": 4.339650504553286, + "grad_norm": 0.939612090587616, + "learning_rate": 8.954333400753634e-07, + "loss": 0.5566, + "step": 11756 + }, + { + "epoch": 4.340019689884322, + "grad_norm": 0.9313437342643738, + "learning_rate": 8.950603939871408e-07, + "loss": 0.5552, + "step": 11757 + }, + { + "epoch": 4.340388875215358, + "grad_norm": 0.9385122656822205, + "learning_rate": 8.946875086453746e-07, + "loss": 0.5512, + "step": 11758 + }, + { + "epoch": 4.340758060546395, + "grad_norm": 0.9426828026771545, + "learning_rate": 8.943146840641817e-07, + "loss": 0.5571, + "step": 11759 + }, + { + "epoch": 4.341127245877431, + "grad_norm": 0.9484424591064453, + "learning_rate": 8.939419202576694e-07, + "loss": 0.549, + "step": 11760 + }, + { + "epoch": 4.3414964312084665, + "grad_norm": 0.940065324306488, + "learning_rate": 8.935692172399482e-07, + "loss": 0.5694, + "step": 11761 + }, + { + "epoch": 4.341865616539502, + "grad_norm": 0.9544074535369873, + "learning_rate": 8.931965750251264e-07, + "loss": 0.5783, + "step": 11762 + }, + { + "epoch": 4.342234801870539, + "grad_norm": 0.9617428183555603, + "learning_rate": 8.928239936273069e-07, + "loss": 0.6003, + "step": 11763 + }, + { + "epoch": 4.342603987201575, + "grad_norm": 0.9580993056297302, + "learning_rate": 8.92451473060592e-07, + "loss": 0.6125, + "step": 11764 + }, + { + "epoch": 4.342973172532611, + "grad_norm": 0.9601118564605713, + "learning_rate": 8.920790133390808e-07, + "loss": 0.5717, + "step": 11765 + }, + { + "epoch": 4.343342357863648, + "grad_norm": 0.9447959661483765, + "learning_rate": 8.917066144768713e-07, + "loss": 0.5906, + "step": 11766 + }, + { + "epoch": 4.343711543194684, + "grad_norm": 0.9591637849807739, + "learning_rate": 8.913342764880609e-07, + "loss": 0.5563, + "step": 11767 + }, + { + "epoch": 4.34408072852572, + "grad_norm": 0.9428394436836243, + "learning_rate": 8.909619993867382e-07, + "loss": 0.6115, + "step": 11768 + }, + { + "epoch": 4.3444499138567565, + "grad_norm": 0.9578239321708679, + "learning_rate": 8.905897831869972e-07, + "loss": 0.6073, + "step": 11769 + }, + { + "epoch": 4.344819099187792, + "grad_norm": 0.95002681016922, + "learning_rate": 8.902176279029243e-07, + "loss": 0.589, + "step": 11770 + }, + { + "epoch": 4.345188284518828, + "grad_norm": 0.9450660943984985, + "learning_rate": 8.898455335486067e-07, + "loss": 0.5934, + "step": 11771 + }, + { + "epoch": 4.345557469849864, + "grad_norm": 1.023120403289795, + "learning_rate": 8.894735001381275e-07, + "loss": 0.5887, + "step": 11772 + }, + { + "epoch": 4.345926655180901, + "grad_norm": 0.9362996220588684, + "learning_rate": 8.891015276855672e-07, + "loss": 0.5811, + "step": 11773 + }, + { + "epoch": 4.346295840511937, + "grad_norm": 0.9785075783729553, + "learning_rate": 8.887296162050066e-07, + "loss": 0.5756, + "step": 11774 + }, + { + "epoch": 4.346665025842973, + "grad_norm": 0.92833411693573, + "learning_rate": 8.883577657105213e-07, + "loss": 0.5672, + "step": 11775 + }, + { + "epoch": 4.34703421117401, + "grad_norm": 0.963888943195343, + "learning_rate": 8.879859762161852e-07, + "loss": 0.6158, + "step": 11776 + }, + { + "epoch": 4.347403396505046, + "grad_norm": 0.9391088485717773, + "learning_rate": 8.876142477360719e-07, + "loss": 0.516, + "step": 11777 + }, + { + "epoch": 4.3477725818360815, + "grad_norm": 0.9175420999526978, + "learning_rate": 8.872425802842497e-07, + "loss": 0.5574, + "step": 11778 + }, + { + "epoch": 4.348141767167117, + "grad_norm": 0.9314680099487305, + "learning_rate": 8.868709738747877e-07, + "loss": 0.554, + "step": 11779 + }, + { + "epoch": 4.348510952498154, + "grad_norm": 0.9542201161384583, + "learning_rate": 8.864994285217501e-07, + "loss": 0.5656, + "step": 11780 + }, + { + "epoch": 4.34888013782919, + "grad_norm": 0.956017017364502, + "learning_rate": 8.861279442391993e-07, + "loss": 0.5703, + "step": 11781 + }, + { + "epoch": 4.349249323160226, + "grad_norm": 0.9860715866088867, + "learning_rate": 8.857565210411972e-07, + "loss": 0.5632, + "step": 11782 + }, + { + "epoch": 4.349618508491263, + "grad_norm": 0.9152863025665283, + "learning_rate": 8.853851589418014e-07, + "loss": 0.5352, + "step": 11783 + }, + { + "epoch": 4.349987693822299, + "grad_norm": 0.992739200592041, + "learning_rate": 8.850138579550674e-07, + "loss": 0.5552, + "step": 11784 + }, + { + "epoch": 4.350356879153335, + "grad_norm": 0.9087722897529602, + "learning_rate": 8.846426180950487e-07, + "loss": 0.5577, + "step": 11785 + }, + { + "epoch": 4.3507260644843715, + "grad_norm": 0.9655054211616516, + "learning_rate": 8.84271439375797e-07, + "loss": 0.5524, + "step": 11786 + }, + { + "epoch": 4.351095249815407, + "grad_norm": 0.9300803542137146, + "learning_rate": 8.839003218113631e-07, + "loss": 0.5766, + "step": 11787 + }, + { + "epoch": 4.351464435146443, + "grad_norm": 0.9543015956878662, + "learning_rate": 8.835292654157901e-07, + "loss": 0.6091, + "step": 11788 + }, + { + "epoch": 4.35183362047748, + "grad_norm": 0.9418762922286987, + "learning_rate": 8.831582702031244e-07, + "loss": 0.5804, + "step": 11789 + }, + { + "epoch": 4.352202805808516, + "grad_norm": 0.9625342488288879, + "learning_rate": 8.827873361874087e-07, + "loss": 0.5882, + "step": 11790 + }, + { + "epoch": 4.352571991139552, + "grad_norm": 0.9239534735679626, + "learning_rate": 8.824164633826821e-07, + "loss": 0.5519, + "step": 11791 + }, + { + "epoch": 4.352941176470588, + "grad_norm": 0.9605569243431091, + "learning_rate": 8.820456518029816e-07, + "loss": 0.5193, + "step": 11792 + }, + { + "epoch": 4.353310361801625, + "grad_norm": 0.9193978905677795, + "learning_rate": 8.816749014623416e-07, + "loss": 0.5411, + "step": 11793 + }, + { + "epoch": 4.353679547132661, + "grad_norm": 0.982879102230072, + "learning_rate": 8.813042123747962e-07, + "loss": 0.552, + "step": 11794 + }, + { + "epoch": 4.3540487324636965, + "grad_norm": 0.9528630971908569, + "learning_rate": 8.809335845543768e-07, + "loss": 0.5735, + "step": 11795 + }, + { + "epoch": 4.354417917794733, + "grad_norm": 0.9991348385810852, + "learning_rate": 8.805630180151087e-07, + "loss": 0.5373, + "step": 11796 + }, + { + "epoch": 4.354787103125769, + "grad_norm": 0.9417344927787781, + "learning_rate": 8.801925127710201e-07, + "loss": 0.6339, + "step": 11797 + }, + { + "epoch": 4.355156288456805, + "grad_norm": 0.9548045992851257, + "learning_rate": 8.79822068836133e-07, + "loss": 0.5426, + "step": 11798 + }, + { + "epoch": 4.355525473787841, + "grad_norm": 0.9323415160179138, + "learning_rate": 8.794516862244701e-07, + "loss": 0.5599, + "step": 11799 + }, + { + "epoch": 4.355894659118878, + "grad_norm": 0.9703313112258911, + "learning_rate": 8.79081364950049e-07, + "loss": 0.566, + "step": 11800 + }, + { + "epoch": 4.356263844449914, + "grad_norm": 0.9231736660003662, + "learning_rate": 8.787111050268862e-07, + "loss": 0.5709, + "step": 11801 + }, + { + "epoch": 4.35663302978095, + "grad_norm": 0.9423268437385559, + "learning_rate": 8.783409064689971e-07, + "loss": 0.5777, + "step": 11802 + }, + { + "epoch": 4.3570022151119865, + "grad_norm": 0.9426971673965454, + "learning_rate": 8.77970769290393e-07, + "loss": 0.5693, + "step": 11803 + }, + { + "epoch": 4.357371400443022, + "grad_norm": 0.933464527130127, + "learning_rate": 8.776006935050826e-07, + "loss": 0.5444, + "step": 11804 + }, + { + "epoch": 4.357740585774058, + "grad_norm": 0.9233007431030273, + "learning_rate": 8.772306791270746e-07, + "loss": 0.5582, + "step": 11805 + }, + { + "epoch": 4.358109771105095, + "grad_norm": 0.9715560674667358, + "learning_rate": 8.768607261703724e-07, + "loss": 0.5763, + "step": 11806 + }, + { + "epoch": 4.358478956436131, + "grad_norm": 0.9661466479301453, + "learning_rate": 8.764908346489812e-07, + "loss": 0.606, + "step": 11807 + }, + { + "epoch": 4.358848141767167, + "grad_norm": 0.981696367263794, + "learning_rate": 8.761210045768975e-07, + "loss": 0.5196, + "step": 11808 + }, + { + "epoch": 4.359217327098204, + "grad_norm": 0.9214773774147034, + "learning_rate": 8.757512359681217e-07, + "loss": 0.5347, + "step": 11809 + }, + { + "epoch": 4.35958651242924, + "grad_norm": 0.943116307258606, + "learning_rate": 8.753815288366496e-07, + "loss": 0.5649, + "step": 11810 + }, + { + "epoch": 4.359955697760276, + "grad_norm": 0.9609165191650391, + "learning_rate": 8.750118831964741e-07, + "loss": 0.5595, + "step": 11811 + }, + { + "epoch": 4.3603248830913115, + "grad_norm": 0.9474302530288696, + "learning_rate": 8.746422990615857e-07, + "loss": 0.546, + "step": 11812 + }, + { + "epoch": 4.360694068422348, + "grad_norm": 0.9676606059074402, + "learning_rate": 8.742727764459727e-07, + "loss": 0.5401, + "step": 11813 + }, + { + "epoch": 4.361063253753384, + "grad_norm": 0.9415005445480347, + "learning_rate": 8.73903315363622e-07, + "loss": 0.559, + "step": 11814 + }, + { + "epoch": 4.36143243908442, + "grad_norm": 0.9457535147666931, + "learning_rate": 8.735339158285191e-07, + "loss": 0.5732, + "step": 11815 + }, + { + "epoch": 4.361801624415457, + "grad_norm": 0.9772605299949646, + "learning_rate": 8.731645778546427e-07, + "loss": 0.5784, + "step": 11816 + }, + { + "epoch": 4.362170809746493, + "grad_norm": 0.9511733651161194, + "learning_rate": 8.727953014559745e-07, + "loss": 0.5779, + "step": 11817 + }, + { + "epoch": 4.362539995077529, + "grad_norm": 0.9586591124534607, + "learning_rate": 8.724260866464895e-07, + "loss": 0.5834, + "step": 11818 + }, + { + "epoch": 4.362909180408565, + "grad_norm": 0.9448732733726501, + "learning_rate": 8.720569334401643e-07, + "loss": 0.5946, + "step": 11819 + }, + { + "epoch": 4.3632783657396015, + "grad_norm": 0.9664692878723145, + "learning_rate": 8.716878418509706e-07, + "loss": 0.5669, + "step": 11820 + }, + { + "epoch": 4.363647551070637, + "grad_norm": 0.9823465943336487, + "learning_rate": 8.713188118928773e-07, + "loss": 0.5659, + "step": 11821 + }, + { + "epoch": 4.364016736401673, + "grad_norm": 0.9678393006324768, + "learning_rate": 8.709498435798536e-07, + "loss": 0.5636, + "step": 11822 + }, + { + "epoch": 4.36438592173271, + "grad_norm": 0.9920006990432739, + "learning_rate": 8.705809369258645e-07, + "loss": 0.5777, + "step": 11823 + }, + { + "epoch": 4.364755107063746, + "grad_norm": 0.9479106664657593, + "learning_rate": 8.702120919448715e-07, + "loss": 0.5448, + "step": 11824 + }, + { + "epoch": 4.365124292394782, + "grad_norm": 0.940296471118927, + "learning_rate": 8.698433086508373e-07, + "loss": 0.5798, + "step": 11825 + }, + { + "epoch": 4.365493477725819, + "grad_norm": 0.9450646042823792, + "learning_rate": 8.694745870577187e-07, + "loss": 0.5237, + "step": 11826 + }, + { + "epoch": 4.365862663056855, + "grad_norm": 0.9315647482872009, + "learning_rate": 8.691059271794733e-07, + "loss": 0.5984, + "step": 11827 + }, + { + "epoch": 4.366231848387891, + "grad_norm": 0.9655883312225342, + "learning_rate": 8.687373290300535e-07, + "loss": 0.5634, + "step": 11828 + }, + { + "epoch": 4.3666010337189265, + "grad_norm": 0.934833288192749, + "learning_rate": 8.683687926234105e-07, + "loss": 0.5658, + "step": 11829 + }, + { + "epoch": 4.366970219049963, + "grad_norm": 0.9539170861244202, + "learning_rate": 8.680003179734944e-07, + "loss": 0.5775, + "step": 11830 + }, + { + "epoch": 4.367339404380999, + "grad_norm": 0.9464873671531677, + "learning_rate": 8.676319050942513e-07, + "loss": 0.5809, + "step": 11831 + }, + { + "epoch": 4.367708589712035, + "grad_norm": 0.9396985173225403, + "learning_rate": 8.672635539996243e-07, + "loss": 0.584, + "step": 11832 + }, + { + "epoch": 4.368077775043072, + "grad_norm": 0.9737354516983032, + "learning_rate": 8.668952647035578e-07, + "loss": 0.5893, + "step": 11833 + }, + { + "epoch": 4.368446960374108, + "grad_norm": 0.9333685636520386, + "learning_rate": 8.665270372199889e-07, + "loss": 0.5769, + "step": 11834 + }, + { + "epoch": 4.368816145705144, + "grad_norm": 0.9441033601760864, + "learning_rate": 8.66158871562858e-07, + "loss": 0.5584, + "step": 11835 + }, + { + "epoch": 4.369185331036181, + "grad_norm": 0.9658287167549133, + "learning_rate": 8.657907677460961e-07, + "loss": 0.5913, + "step": 11836 + }, + { + "epoch": 4.3695545163672165, + "grad_norm": 0.9547070860862732, + "learning_rate": 8.654227257836384e-07, + "loss": 0.5654, + "step": 11837 + }, + { + "epoch": 4.369923701698252, + "grad_norm": 0.9415487051010132, + "learning_rate": 8.650547456894151e-07, + "loss": 0.5801, + "step": 11838 + }, + { + "epoch": 4.370292887029288, + "grad_norm": 0.9755802154541016, + "learning_rate": 8.646868274773537e-07, + "loss": 0.6004, + "step": 11839 + }, + { + "epoch": 4.370662072360325, + "grad_norm": 0.9656595587730408, + "learning_rate": 8.6431897116138e-07, + "loss": 0.5818, + "step": 11840 + }, + { + "epoch": 4.371031257691361, + "grad_norm": 0.8946307301521301, + "learning_rate": 8.63951176755416e-07, + "loss": 0.5364, + "step": 11841 + }, + { + "epoch": 4.371400443022397, + "grad_norm": 0.938202440738678, + "learning_rate": 8.635834442733836e-07, + "loss": 0.5762, + "step": 11842 + }, + { + "epoch": 4.371769628353434, + "grad_norm": 0.9651269316673279, + "learning_rate": 8.632157737292032e-07, + "loss": 0.5557, + "step": 11843 + }, + { + "epoch": 4.37213881368447, + "grad_norm": 0.9338123202323914, + "learning_rate": 8.628481651367876e-07, + "loss": 0.5672, + "step": 11844 + }, + { + "epoch": 4.372507999015506, + "grad_norm": 0.952494204044342, + "learning_rate": 8.624806185100528e-07, + "loss": 0.5652, + "step": 11845 + }, + { + "epoch": 4.372877184346542, + "grad_norm": 0.9524948000907898, + "learning_rate": 8.621131338629091e-07, + "loss": 0.5933, + "step": 11846 + }, + { + "epoch": 4.373246369677578, + "grad_norm": 0.9736582636833191, + "learning_rate": 8.617457112092673e-07, + "loss": 0.5604, + "step": 11847 + }, + { + "epoch": 4.373615555008614, + "grad_norm": 0.9738215208053589, + "learning_rate": 8.61378350563033e-07, + "loss": 0.5696, + "step": 11848 + }, + { + "epoch": 4.37398474033965, + "grad_norm": 0.9388606548309326, + "learning_rate": 8.610110519381104e-07, + "loss": 0.5772, + "step": 11849 + }, + { + "epoch": 4.374353925670687, + "grad_norm": 0.9545698165893555, + "learning_rate": 8.606438153484028e-07, + "loss": 0.5835, + "step": 11850 + }, + { + "epoch": 4.374723111001723, + "grad_norm": 0.9259491562843323, + "learning_rate": 8.602766408078095e-07, + "loss": 0.5366, + "step": 11851 + }, + { + "epoch": 4.375092296332759, + "grad_norm": 0.9208692908287048, + "learning_rate": 8.599095283302272e-07, + "loss": 0.5822, + "step": 11852 + }, + { + "epoch": 4.375461481663796, + "grad_norm": 0.9811751842498779, + "learning_rate": 8.595424779295522e-07, + "loss": 0.5744, + "step": 11853 + }, + { + "epoch": 4.3758306669948315, + "grad_norm": 0.9448829293251038, + "learning_rate": 8.591754896196761e-07, + "loss": 0.6035, + "step": 11854 + }, + { + "epoch": 4.376199852325867, + "grad_norm": 0.9564924836158752, + "learning_rate": 8.588085634144913e-07, + "loss": 0.5327, + "step": 11855 + }, + { + "epoch": 4.376569037656903, + "grad_norm": 0.9235681891441345, + "learning_rate": 8.584416993278829e-07, + "loss": 0.586, + "step": 11856 + }, + { + "epoch": 4.37693822298794, + "grad_norm": 0.9572246074676514, + "learning_rate": 8.580748973737379e-07, + "loss": 0.5867, + "step": 11857 + }, + { + "epoch": 4.377307408318976, + "grad_norm": 0.9639289379119873, + "learning_rate": 8.577081575659407e-07, + "loss": 0.5523, + "step": 11858 + }, + { + "epoch": 4.377676593650012, + "grad_norm": 0.9525726437568665, + "learning_rate": 8.573414799183716e-07, + "loss": 0.5468, + "step": 11859 + }, + { + "epoch": 4.378045778981049, + "grad_norm": 0.9295569062232971, + "learning_rate": 8.569748644449083e-07, + "loss": 0.5307, + "step": 11860 + }, + { + "epoch": 4.378414964312085, + "grad_norm": 0.9464076161384583, + "learning_rate": 8.566083111594284e-07, + "loss": 0.5505, + "step": 11861 + }, + { + "epoch": 4.378784149643121, + "grad_norm": 0.9370673298835754, + "learning_rate": 8.562418200758046e-07, + "loss": 0.5834, + "step": 11862 + }, + { + "epoch": 4.379153334974157, + "grad_norm": 0.9722147583961487, + "learning_rate": 8.558753912079107e-07, + "loss": 0.5728, + "step": 11863 + }, + { + "epoch": 4.379522520305193, + "grad_norm": 0.9576153755187988, + "learning_rate": 8.555090245696126e-07, + "loss": 0.5579, + "step": 11864 + }, + { + "epoch": 4.379891705636229, + "grad_norm": 0.9389671683311462, + "learning_rate": 8.551427201747791e-07, + "loss": 0.5693, + "step": 11865 + }, + { + "epoch": 4.380260890967266, + "grad_norm": 0.9601026773452759, + "learning_rate": 8.547764780372752e-07, + "loss": 0.564, + "step": 11866 + }, + { + "epoch": 4.380630076298302, + "grad_norm": 0.9450255632400513, + "learning_rate": 8.544102981709626e-07, + "loss": 0.5988, + "step": 11867 + }, + { + "epoch": 4.380999261629338, + "grad_norm": 0.9509219527244568, + "learning_rate": 8.540441805897007e-07, + "loss": 0.5776, + "step": 11868 + }, + { + "epoch": 4.381368446960374, + "grad_norm": 0.9349048733711243, + "learning_rate": 8.536781253073461e-07, + "loss": 0.5895, + "step": 11869 + }, + { + "epoch": 4.3817376322914106, + "grad_norm": 0.9579769372940063, + "learning_rate": 8.533121323377558e-07, + "loss": 0.5616, + "step": 11870 + }, + { + "epoch": 4.3821068176224465, + "grad_norm": 0.9634438753128052, + "learning_rate": 8.529462016947817e-07, + "loss": 0.6268, + "step": 11871 + }, + { + "epoch": 4.382476002953482, + "grad_norm": 0.9524003267288208, + "learning_rate": 8.525803333922733e-07, + "loss": 0.6285, + "step": 11872 + }, + { + "epoch": 4.382845188284519, + "grad_norm": 0.9875625967979431, + "learning_rate": 8.522145274440799e-07, + "loss": 0.5647, + "step": 11873 + }, + { + "epoch": 4.383214373615555, + "grad_norm": 0.9479991793632507, + "learning_rate": 8.51848783864046e-07, + "loss": 0.5827, + "step": 11874 + }, + { + "epoch": 4.383583558946591, + "grad_norm": 0.9906951785087585, + "learning_rate": 8.514831026660161e-07, + "loss": 0.5833, + "step": 11875 + }, + { + "epoch": 4.383952744277627, + "grad_norm": 0.9754274487495422, + "learning_rate": 8.511174838638306e-07, + "loss": 0.5815, + "step": 11876 + }, + { + "epoch": 4.384321929608664, + "grad_norm": 0.9161077737808228, + "learning_rate": 8.507519274713272e-07, + "loss": 0.5782, + "step": 11877 + }, + { + "epoch": 4.3846911149397, + "grad_norm": 0.9740307331085205, + "learning_rate": 8.503864335023434e-07, + "loss": 0.5446, + "step": 11878 + }, + { + "epoch": 4.385060300270736, + "grad_norm": 0.9696300029754639, + "learning_rate": 8.500210019707128e-07, + "loss": 0.5967, + "step": 11879 + }, + { + "epoch": 4.385429485601772, + "grad_norm": 0.9476395845413208, + "learning_rate": 8.496556328902655e-07, + "loss": 0.5869, + "step": 11880 + }, + { + "epoch": 4.385798670932808, + "grad_norm": 0.9918621778488159, + "learning_rate": 8.492903262748326e-07, + "loss": 0.5638, + "step": 11881 + }, + { + "epoch": 4.386167856263844, + "grad_norm": 0.9162431359291077, + "learning_rate": 8.489250821382389e-07, + "loss": 0.5174, + "step": 11882 + }, + { + "epoch": 4.386537041594881, + "grad_norm": 0.9303983449935913, + "learning_rate": 8.485599004943112e-07, + "loss": 0.6016, + "step": 11883 + }, + { + "epoch": 4.386906226925917, + "grad_norm": 0.9793349504470825, + "learning_rate": 8.481947813568689e-07, + "loss": 0.5804, + "step": 11884 + }, + { + "epoch": 4.387275412256953, + "grad_norm": 0.9489377737045288, + "learning_rate": 8.478297247397324e-07, + "loss": 0.5505, + "step": 11885 + }, + { + "epoch": 4.38764459758799, + "grad_norm": 0.9533929824829102, + "learning_rate": 8.474647306567204e-07, + "loss": 0.581, + "step": 11886 + }, + { + "epoch": 4.3880137829190256, + "grad_norm": 0.9394321441650391, + "learning_rate": 8.470997991216468e-07, + "loss": 0.5806, + "step": 11887 + }, + { + "epoch": 4.3883829682500615, + "grad_norm": 0.9504750967025757, + "learning_rate": 8.467349301483241e-07, + "loss": 0.6018, + "step": 11888 + }, + { + "epoch": 4.388752153581097, + "grad_norm": 0.9540665745735168, + "learning_rate": 8.463701237505617e-07, + "loss": 0.5773, + "step": 11889 + }, + { + "epoch": 4.389121338912134, + "grad_norm": 0.9792432188987732, + "learning_rate": 8.460053799421691e-07, + "loss": 0.5843, + "step": 11890 + }, + { + "epoch": 4.38949052424317, + "grad_norm": 0.9442278742790222, + "learning_rate": 8.456406987369509e-07, + "loss": 0.5911, + "step": 11891 + }, + { + "epoch": 4.389859709574206, + "grad_norm": 1.0242265462875366, + "learning_rate": 8.452760801487092e-07, + "loss": 0.5649, + "step": 11892 + }, + { + "epoch": 4.390228894905243, + "grad_norm": 0.9370100498199463, + "learning_rate": 8.449115241912464e-07, + "loss": 0.5564, + "step": 11893 + }, + { + "epoch": 4.390598080236279, + "grad_norm": 0.9308733344078064, + "learning_rate": 8.445470308783594e-07, + "loss": 0.5708, + "step": 11894 + }, + { + "epoch": 4.390967265567315, + "grad_norm": 0.9443077445030212, + "learning_rate": 8.441826002238454e-07, + "loss": 0.6067, + "step": 11895 + }, + { + "epoch": 4.391336450898351, + "grad_norm": 1.0098919868469238, + "learning_rate": 8.438182322414976e-07, + "loss": 0.6125, + "step": 11896 + }, + { + "epoch": 4.391705636229387, + "grad_norm": 0.9544092416763306, + "learning_rate": 8.434539269451061e-07, + "loss": 0.565, + "step": 11897 + }, + { + "epoch": 4.392074821560423, + "grad_norm": 0.947734534740448, + "learning_rate": 8.430896843484615e-07, + "loss": 0.5892, + "step": 11898 + }, + { + "epoch": 4.392444006891459, + "grad_norm": 0.9482125043869019, + "learning_rate": 8.427255044653496e-07, + "loss": 0.6169, + "step": 11899 + }, + { + "epoch": 4.392813192222496, + "grad_norm": 0.9482619762420654, + "learning_rate": 8.423613873095531e-07, + "loss": 0.5595, + "step": 11900 + }, + { + "epoch": 4.393182377553532, + "grad_norm": 0.9379405975341797, + "learning_rate": 8.419973328948561e-07, + "loss": 0.5492, + "step": 11901 + }, + { + "epoch": 4.393551562884568, + "grad_norm": 0.9786723852157593, + "learning_rate": 8.416333412350358e-07, + "loss": 0.6099, + "step": 11902 + }, + { + "epoch": 4.393920748215605, + "grad_norm": 0.9217785000801086, + "learning_rate": 8.412694123438708e-07, + "loss": 0.5492, + "step": 11903 + }, + { + "epoch": 4.3942899335466405, + "grad_norm": 0.9711700677871704, + "learning_rate": 8.409055462351354e-07, + "loss": 0.5886, + "step": 11904 + }, + { + "epoch": 4.3946591188776765, + "grad_norm": 0.9604065418243408, + "learning_rate": 8.405417429226004e-07, + "loss": 0.5931, + "step": 11905 + }, + { + "epoch": 4.395028304208712, + "grad_norm": 0.9484656453132629, + "learning_rate": 8.401780024200377e-07, + "loss": 0.5805, + "step": 11906 + }, + { + "epoch": 4.395397489539749, + "grad_norm": 0.9937628507614136, + "learning_rate": 8.398143247412135e-07, + "loss": 0.5771, + "step": 11907 + }, + { + "epoch": 4.395766674870785, + "grad_norm": 0.9378693699836731, + "learning_rate": 8.394507098998925e-07, + "loss": 0.5758, + "step": 11908 + }, + { + "epoch": 4.396135860201821, + "grad_norm": 0.9644753336906433, + "learning_rate": 8.390871579098389e-07, + "loss": 0.5766, + "step": 11909 + }, + { + "epoch": 4.396505045532858, + "grad_norm": 0.9414392113685608, + "learning_rate": 8.387236687848121e-07, + "loss": 0.5215, + "step": 11910 + }, + { + "epoch": 4.396874230863894, + "grad_norm": 0.9513487815856934, + "learning_rate": 8.383602425385701e-07, + "loss": 0.5696, + "step": 11911 + }, + { + "epoch": 4.39724341619493, + "grad_norm": 0.9593233466148376, + "learning_rate": 8.379968791848678e-07, + "loss": 0.5599, + "step": 11912 + }, + { + "epoch": 4.3976126015259664, + "grad_norm": 0.9443486928939819, + "learning_rate": 8.376335787374589e-07, + "loss": 0.5775, + "step": 11913 + }, + { + "epoch": 4.397981786857002, + "grad_norm": 0.9264695644378662, + "learning_rate": 8.372703412100952e-07, + "loss": 0.5727, + "step": 11914 + }, + { + "epoch": 4.398350972188038, + "grad_norm": 0.9425567984580994, + "learning_rate": 8.369071666165246e-07, + "loss": 0.57, + "step": 11915 + }, + { + "epoch": 4.398720157519074, + "grad_norm": 0.9105877876281738, + "learning_rate": 8.365440549704925e-07, + "loss": 0.5523, + "step": 11916 + }, + { + "epoch": 4.399089342850111, + "grad_norm": 0.9447840452194214, + "learning_rate": 8.361810062857426e-07, + "loss": 0.5591, + "step": 11917 + }, + { + "epoch": 4.399458528181147, + "grad_norm": 0.9462909698486328, + "learning_rate": 8.358180205760169e-07, + "loss": 0.583, + "step": 11918 + }, + { + "epoch": 4.399827713512183, + "grad_norm": 0.960769534111023, + "learning_rate": 8.354550978550541e-07, + "loss": 0.5567, + "step": 11919 + }, + { + "epoch": 4.40019689884322, + "grad_norm": 0.9502401351928711, + "learning_rate": 8.350922381365897e-07, + "loss": 0.5811, + "step": 11920 + }, + { + "epoch": 4.4005660841742555, + "grad_norm": 0.9759312868118286, + "learning_rate": 8.347294414343593e-07, + "loss": 0.5642, + "step": 11921 + }, + { + "epoch": 4.4009352695052915, + "grad_norm": 0.9571315050125122, + "learning_rate": 8.343667077620932e-07, + "loss": 0.5431, + "step": 11922 + }, + { + "epoch": 4.401304454836328, + "grad_norm": 0.9386094212532043, + "learning_rate": 8.340040371335223e-07, + "loss": 0.5624, + "step": 11923 + }, + { + "epoch": 4.401673640167364, + "grad_norm": 0.9853322505950928, + "learning_rate": 8.336414295623729e-07, + "loss": 0.5932, + "step": 11924 + }, + { + "epoch": 4.4020428254984, + "grad_norm": 0.941765308380127, + "learning_rate": 8.332788850623688e-07, + "loss": 0.5899, + "step": 11925 + }, + { + "epoch": 4.402412010829436, + "grad_norm": 0.9730892181396484, + "learning_rate": 8.329164036472334e-07, + "loss": 0.6062, + "step": 11926 + }, + { + "epoch": 4.402781196160473, + "grad_norm": 0.9666173458099365, + "learning_rate": 8.32553985330686e-07, + "loss": 0.5873, + "step": 11927 + }, + { + "epoch": 4.403150381491509, + "grad_norm": 0.9792605638504028, + "learning_rate": 8.321916301264435e-07, + "loss": 0.5992, + "step": 11928 + }, + { + "epoch": 4.403519566822545, + "grad_norm": 0.9548709392547607, + "learning_rate": 8.31829338048222e-07, + "loss": 0.6003, + "step": 11929 + }, + { + "epoch": 4.403888752153581, + "grad_norm": 0.9787153005599976, + "learning_rate": 8.314671091097338e-07, + "loss": 0.5998, + "step": 11930 + }, + { + "epoch": 4.404257937484617, + "grad_norm": 0.9611805081367493, + "learning_rate": 8.311049433246887e-07, + "loss": 0.5924, + "step": 11931 + }, + { + "epoch": 4.404627122815653, + "grad_norm": 0.9517995119094849, + "learning_rate": 8.307428407067938e-07, + "loss": 0.552, + "step": 11932 + }, + { + "epoch": 4.404996308146689, + "grad_norm": 0.9750276207923889, + "learning_rate": 8.303808012697559e-07, + "loss": 0.5298, + "step": 11933 + }, + { + "epoch": 4.405365493477726, + "grad_norm": 0.9533432126045227, + "learning_rate": 8.300188250272784e-07, + "loss": 0.5863, + "step": 11934 + }, + { + "epoch": 4.405734678808762, + "grad_norm": 0.9836182594299316, + "learning_rate": 8.296569119930614e-07, + "loss": 0.5818, + "step": 11935 + }, + { + "epoch": 4.406103864139798, + "grad_norm": 0.9723351001739502, + "learning_rate": 8.292950621808022e-07, + "loss": 0.5726, + "step": 11936 + }, + { + "epoch": 4.406473049470835, + "grad_norm": 0.9529734253883362, + "learning_rate": 8.289332756041985e-07, + "loss": 0.5528, + "step": 11937 + }, + { + "epoch": 4.4068422348018705, + "grad_norm": 0.9403586387634277, + "learning_rate": 8.285715522769427e-07, + "loss": 0.619, + "step": 11938 + }, + { + "epoch": 4.4072114201329065, + "grad_norm": 0.986297607421875, + "learning_rate": 8.282098922127262e-07, + "loss": 0.5931, + "step": 11939 + }, + { + "epoch": 4.407580605463943, + "grad_norm": 0.978990375995636, + "learning_rate": 8.27848295425237e-07, + "loss": 0.5573, + "step": 11940 + }, + { + "epoch": 4.407949790794979, + "grad_norm": 0.996968150138855, + "learning_rate": 8.27486761928162e-07, + "loss": 0.5738, + "step": 11941 + }, + { + "epoch": 4.408318976126015, + "grad_norm": 0.9546469449996948, + "learning_rate": 8.271252917351863e-07, + "loss": 0.5714, + "step": 11942 + }, + { + "epoch": 4.408688161457052, + "grad_norm": 0.961959719657898, + "learning_rate": 8.267638848599901e-07, + "loss": 0.5945, + "step": 11943 + }, + { + "epoch": 4.409057346788088, + "grad_norm": 0.957392692565918, + "learning_rate": 8.264025413162527e-07, + "loss": 0.5684, + "step": 11944 + }, + { + "epoch": 4.409426532119124, + "grad_norm": 0.9810359477996826, + "learning_rate": 8.260412611176502e-07, + "loss": 0.551, + "step": 11945 + }, + { + "epoch": 4.40979571745016, + "grad_norm": 0.9024922847747803, + "learning_rate": 8.256800442778584e-07, + "loss": 0.5601, + "step": 11946 + }, + { + "epoch": 4.410164902781196, + "grad_norm": 0.9471371173858643, + "learning_rate": 8.253188908105486e-07, + "loss": 0.5612, + "step": 11947 + }, + { + "epoch": 4.410534088112232, + "grad_norm": 0.9477766752243042, + "learning_rate": 8.249578007293893e-07, + "loss": 0.5288, + "step": 11948 + }, + { + "epoch": 4.410903273443268, + "grad_norm": 0.9335000514984131, + "learning_rate": 8.245967740480496e-07, + "loss": 0.5661, + "step": 11949 + }, + { + "epoch": 4.411272458774305, + "grad_norm": 0.9617900252342224, + "learning_rate": 8.242358107801931e-07, + "loss": 0.5785, + "step": 11950 + }, + { + "epoch": 4.411641644105341, + "grad_norm": 0.9740603566169739, + "learning_rate": 8.238749109394814e-07, + "loss": 0.5597, + "step": 11951 + }, + { + "epoch": 4.412010829436377, + "grad_norm": 0.9548958539962769, + "learning_rate": 8.235140745395765e-07, + "loss": 0.5786, + "step": 11952 + }, + { + "epoch": 4.412380014767413, + "grad_norm": 0.9524064064025879, + "learning_rate": 8.231533015941334e-07, + "loss": 0.5787, + "step": 11953 + }, + { + "epoch": 4.41274920009845, + "grad_norm": 0.9429553747177124, + "learning_rate": 8.227925921168098e-07, + "loss": 0.6071, + "step": 11954 + }, + { + "epoch": 4.4131183854294855, + "grad_norm": 0.9947468638420105, + "learning_rate": 8.224319461212571e-07, + "loss": 0.5648, + "step": 11955 + }, + { + "epoch": 4.4134875707605215, + "grad_norm": 0.9689159393310547, + "learning_rate": 8.220713636211253e-07, + "loss": 0.5762, + "step": 11956 + }, + { + "epoch": 4.413856756091558, + "grad_norm": 0.9395607709884644, + "learning_rate": 8.217108446300631e-07, + "loss": 0.5437, + "step": 11957 + }, + { + "epoch": 4.414225941422594, + "grad_norm": 0.9085915088653564, + "learning_rate": 8.213503891617161e-07, + "loss": 0.5599, + "step": 11958 + }, + { + "epoch": 4.41459512675363, + "grad_norm": 0.953592836856842, + "learning_rate": 8.209899972297272e-07, + "loss": 0.5393, + "step": 11959 + }, + { + "epoch": 4.414964312084667, + "grad_norm": 0.9465916156768799, + "learning_rate": 8.206296688477358e-07, + "loss": 0.5858, + "step": 11960 + }, + { + "epoch": 4.415333497415703, + "grad_norm": 0.9454624652862549, + "learning_rate": 8.202694040293819e-07, + "loss": 0.5916, + "step": 11961 + }, + { + "epoch": 4.415702682746739, + "grad_norm": 0.9754135608673096, + "learning_rate": 8.199092027883013e-07, + "loss": 0.558, + "step": 11962 + }, + { + "epoch": 4.416071868077775, + "grad_norm": 0.9273045659065247, + "learning_rate": 8.195490651381274e-07, + "loss": 0.5696, + "step": 11963 + }, + { + "epoch": 4.416441053408811, + "grad_norm": 0.9525840878486633, + "learning_rate": 8.191889910924908e-07, + "loss": 0.539, + "step": 11964 + }, + { + "epoch": 4.416810238739847, + "grad_norm": 0.90742027759552, + "learning_rate": 8.188289806650198e-07, + "loss": 0.5725, + "step": 11965 + }, + { + "epoch": 4.417179424070883, + "grad_norm": 0.9460955262184143, + "learning_rate": 8.184690338693419e-07, + "loss": 0.5694, + "step": 11966 + }, + { + "epoch": 4.41754860940192, + "grad_norm": 0.9510526061058044, + "learning_rate": 8.181091507190806e-07, + "loss": 0.5682, + "step": 11967 + }, + { + "epoch": 4.417917794732956, + "grad_norm": 0.9338508248329163, + "learning_rate": 8.177493312278561e-07, + "loss": 0.5877, + "step": 11968 + }, + { + "epoch": 4.418286980063992, + "grad_norm": 0.9440329074859619, + "learning_rate": 8.173895754092895e-07, + "loss": 0.5152, + "step": 11969 + }, + { + "epoch": 4.418656165395029, + "grad_norm": 0.9421162605285645, + "learning_rate": 8.170298832769962e-07, + "loss": 0.5804, + "step": 11970 + }, + { + "epoch": 4.419025350726065, + "grad_norm": 0.9834299087524414, + "learning_rate": 8.166702548445901e-07, + "loss": 0.5747, + "step": 11971 + }, + { + "epoch": 4.4193945360571005, + "grad_norm": 1.0747253894805908, + "learning_rate": 8.163106901256843e-07, + "loss": 0.5886, + "step": 11972 + }, + { + "epoch": 4.4197637213881364, + "grad_norm": 0.9668318629264832, + "learning_rate": 8.159511891338868e-07, + "loss": 0.5729, + "step": 11973 + }, + { + "epoch": 4.420132906719173, + "grad_norm": 0.9530134201049805, + "learning_rate": 8.15591751882806e-07, + "loss": 0.5773, + "step": 11974 + }, + { + "epoch": 4.420502092050209, + "grad_norm": 1.0370879173278809, + "learning_rate": 8.152323783860458e-07, + "loss": 0.5934, + "step": 11975 + }, + { + "epoch": 4.420871277381245, + "grad_norm": 0.9498851895332336, + "learning_rate": 8.148730686572076e-07, + "loss": 0.5721, + "step": 11976 + }, + { + "epoch": 4.421240462712282, + "grad_norm": 0.9468571543693542, + "learning_rate": 8.145138227098925e-07, + "loss": 0.544, + "step": 11977 + }, + { + "epoch": 4.421609648043318, + "grad_norm": 0.9544144868850708, + "learning_rate": 8.141546405576975e-07, + "loss": 0.5682, + "step": 11978 + }, + { + "epoch": 4.421978833374354, + "grad_norm": 0.9606319069862366, + "learning_rate": 8.137955222142163e-07, + "loss": 0.5558, + "step": 11979 + }, + { + "epoch": 4.4223480187053905, + "grad_norm": 0.9438542127609253, + "learning_rate": 8.134364676930434e-07, + "loss": 0.5743, + "step": 11980 + }, + { + "epoch": 4.422717204036426, + "grad_norm": 0.9304824471473694, + "learning_rate": 8.130774770077671e-07, + "loss": 0.6118, + "step": 11981 + }, + { + "epoch": 4.423086389367462, + "grad_norm": 0.9608962535858154, + "learning_rate": 8.127185501719764e-07, + "loss": 0.5815, + "step": 11982 + }, + { + "epoch": 4.423455574698498, + "grad_norm": 0.9866334795951843, + "learning_rate": 8.123596871992565e-07, + "loss": 0.5501, + "step": 11983 + }, + { + "epoch": 4.423824760029535, + "grad_norm": 0.9428991079330444, + "learning_rate": 8.120008881031888e-07, + "loss": 0.5858, + "step": 11984 + }, + { + "epoch": 4.424193945360571, + "grad_norm": 0.9565616846084595, + "learning_rate": 8.116421528973556e-07, + "loss": 0.5835, + "step": 11985 + }, + { + "epoch": 4.424563130691607, + "grad_norm": 0.9352946281433105, + "learning_rate": 8.112834815953338e-07, + "loss": 0.5648, + "step": 11986 + }, + { + "epoch": 4.424932316022644, + "grad_norm": 0.9450311660766602, + "learning_rate": 8.109248742106995e-07, + "loss": 0.5547, + "step": 11987 + }, + { + "epoch": 4.42530150135368, + "grad_norm": 0.9785549640655518, + "learning_rate": 8.10566330757025e-07, + "loss": 0.5882, + "step": 11988 + }, + { + "epoch": 4.4256706866847155, + "grad_norm": 0.9681649804115295, + "learning_rate": 8.102078512478814e-07, + "loss": 0.5782, + "step": 11989 + }, + { + "epoch": 4.426039872015751, + "grad_norm": 0.9669166803359985, + "learning_rate": 8.098494356968389e-07, + "loss": 0.5498, + "step": 11990 + }, + { + "epoch": 4.426409057346788, + "grad_norm": 0.9635892510414124, + "learning_rate": 8.094910841174603e-07, + "loss": 0.5873, + "step": 11991 + }, + { + "epoch": 4.426778242677824, + "grad_norm": 0.9500514268875122, + "learning_rate": 8.091327965233114e-07, + "loss": 0.5775, + "step": 11992 + }, + { + "epoch": 4.42714742800886, + "grad_norm": 0.9627494215965271, + "learning_rate": 8.08774572927952e-07, + "loss": 0.5945, + "step": 11993 + }, + { + "epoch": 4.427516613339897, + "grad_norm": 0.9429371356964111, + "learning_rate": 8.084164133449418e-07, + "loss": 0.5536, + "step": 11994 + }, + { + "epoch": 4.427885798670933, + "grad_norm": 0.9583233594894409, + "learning_rate": 8.080583177878365e-07, + "loss": 0.5818, + "step": 11995 + }, + { + "epoch": 4.428254984001969, + "grad_norm": 0.9172466397285461, + "learning_rate": 8.077002862701891e-07, + "loss": 0.5558, + "step": 11996 + }, + { + "epoch": 4.4286241693330055, + "grad_norm": 0.9699228405952454, + "learning_rate": 8.073423188055527e-07, + "loss": 0.5758, + "step": 11997 + }, + { + "epoch": 4.428993354664041, + "grad_norm": 0.9464989304542542, + "learning_rate": 8.069844154074752e-07, + "loss": 0.5736, + "step": 11998 + }, + { + "epoch": 4.429362539995077, + "grad_norm": 0.9933573007583618, + "learning_rate": 8.066265760895023e-07, + "loss": 0.5292, + "step": 11999 + }, + { + "epoch": 4.429731725326114, + "grad_norm": 0.9191693663597107, + "learning_rate": 8.0626880086518e-07, + "loss": 0.5803, + "step": 12000 + }, + { + "epoch": 4.43010091065715, + "grad_norm": 0.9506992697715759, + "learning_rate": 8.059110897480482e-07, + "loss": 0.576, + "step": 12001 + }, + { + "epoch": 4.430470095988186, + "grad_norm": 0.9926602840423584, + "learning_rate": 8.055534427516479e-07, + "loss": 0.5496, + "step": 12002 + }, + { + "epoch": 4.430839281319222, + "grad_norm": 0.9325423836708069, + "learning_rate": 8.051958598895149e-07, + "loss": 0.5695, + "step": 12003 + }, + { + "epoch": 4.431208466650259, + "grad_norm": 0.9665876030921936, + "learning_rate": 8.048383411751831e-07, + "loss": 0.5808, + "step": 12004 + }, + { + "epoch": 4.431577651981295, + "grad_norm": 0.9871367812156677, + "learning_rate": 8.044808866221856e-07, + "loss": 0.6045, + "step": 12005 + }, + { + "epoch": 4.4319468373123305, + "grad_norm": 0.9660100340843201, + "learning_rate": 8.041234962440516e-07, + "loss": 0.5792, + "step": 12006 + }, + { + "epoch": 4.432316022643367, + "grad_norm": 0.9592208862304688, + "learning_rate": 8.037661700543081e-07, + "loss": 0.5838, + "step": 12007 + }, + { + "epoch": 4.432685207974403, + "grad_norm": 0.9516308903694153, + "learning_rate": 8.034089080664787e-07, + "loss": 0.5282, + "step": 12008 + }, + { + "epoch": 4.433054393305439, + "grad_norm": 0.9451471567153931, + "learning_rate": 8.030517102940868e-07, + "loss": 0.5789, + "step": 12009 + }, + { + "epoch": 4.433423578636475, + "grad_norm": 0.9803107976913452, + "learning_rate": 8.026945767506531e-07, + "loss": 0.5979, + "step": 12010 + }, + { + "epoch": 4.433792763967512, + "grad_norm": 0.9300100803375244, + "learning_rate": 8.023375074496936e-07, + "loss": 0.5588, + "step": 12011 + }, + { + "epoch": 4.434161949298548, + "grad_norm": 0.9663540720939636, + "learning_rate": 8.01980502404723e-07, + "loss": 0.5873, + "step": 12012 + }, + { + "epoch": 4.434531134629584, + "grad_norm": 0.9443596005439758, + "learning_rate": 8.016235616292556e-07, + "loss": 0.5481, + "step": 12013 + }, + { + "epoch": 4.4349003199606205, + "grad_norm": 0.9325382709503174, + "learning_rate": 8.012666851368001e-07, + "loss": 0.5976, + "step": 12014 + }, + { + "epoch": 4.435269505291656, + "grad_norm": 0.9475048184394836, + "learning_rate": 8.009098729408643e-07, + "loss": 0.5703, + "step": 12015 + }, + { + "epoch": 4.435638690622692, + "grad_norm": 0.9487609267234802, + "learning_rate": 8.005531250549531e-07, + "loss": 0.5696, + "step": 12016 + }, + { + "epoch": 4.436007875953729, + "grad_norm": 0.98902428150177, + "learning_rate": 8.001964414925694e-07, + "loss": 0.5975, + "step": 12017 + }, + { + "epoch": 4.436377061284765, + "grad_norm": 0.9825493097305298, + "learning_rate": 7.99839822267216e-07, + "loss": 0.5695, + "step": 12018 + }, + { + "epoch": 4.436746246615801, + "grad_norm": 0.944680392742157, + "learning_rate": 7.994832673923866e-07, + "loss": 0.5734, + "step": 12019 + }, + { + "epoch": 4.437115431946838, + "grad_norm": 0.9612233638763428, + "learning_rate": 7.9912677688158e-07, + "loss": 0.6068, + "step": 12020 + }, + { + "epoch": 4.437484617277874, + "grad_norm": 0.9587089419364929, + "learning_rate": 7.987703507482872e-07, + "loss": 0.5979, + "step": 12021 + }, + { + "epoch": 4.43785380260891, + "grad_norm": 0.9435788989067078, + "learning_rate": 7.984139890060006e-07, + "loss": 0.5948, + "step": 12022 + }, + { + "epoch": 4.4382229879399455, + "grad_norm": 0.9726777672767639, + "learning_rate": 7.980576916682073e-07, + "loss": 0.5709, + "step": 12023 + }, + { + "epoch": 4.438592173270982, + "grad_norm": 0.9539467096328735, + "learning_rate": 7.977014587483925e-07, + "loss": 0.5623, + "step": 12024 + }, + { + "epoch": 4.438961358602018, + "grad_norm": 0.9521073698997498, + "learning_rate": 7.973452902600412e-07, + "loss": 0.5852, + "step": 12025 + }, + { + "epoch": 4.439330543933054, + "grad_norm": 0.9868634939193726, + "learning_rate": 7.969891862166332e-07, + "loss": 0.5805, + "step": 12026 + }, + { + "epoch": 4.439699729264091, + "grad_norm": 0.9280208349227905, + "learning_rate": 7.96633146631646e-07, + "loss": 0.5686, + "step": 12027 + }, + { + "epoch": 4.440068914595127, + "grad_norm": 1.0029083490371704, + "learning_rate": 7.962771715185577e-07, + "loss": 0.5754, + "step": 12028 + }, + { + "epoch": 4.440438099926163, + "grad_norm": 0.9374696612358093, + "learning_rate": 7.959212608908398e-07, + "loss": 0.594, + "step": 12029 + }, + { + "epoch": 4.440807285257199, + "grad_norm": 0.945218026638031, + "learning_rate": 7.955654147619652e-07, + "loss": 0.5461, + "step": 12030 + }, + { + "epoch": 4.4411764705882355, + "grad_norm": 0.9608343243598938, + "learning_rate": 7.952096331454018e-07, + "loss": 0.5721, + "step": 12031 + }, + { + "epoch": 4.441545655919271, + "grad_norm": 0.9539564847946167, + "learning_rate": 7.948539160546148e-07, + "loss": 0.5603, + "step": 12032 + }, + { + "epoch": 4.441914841250307, + "grad_norm": 0.9760329723358154, + "learning_rate": 7.944982635030696e-07, + "loss": 0.5794, + "step": 12033 + }, + { + "epoch": 4.442284026581344, + "grad_norm": 0.9296929836273193, + "learning_rate": 7.941426755042272e-07, + "loss": 0.5714, + "step": 12034 + }, + { + "epoch": 4.44265321191238, + "grad_norm": 0.9747658371925354, + "learning_rate": 7.937871520715459e-07, + "loss": 0.5803, + "step": 12035 + }, + { + "epoch": 4.443022397243416, + "grad_norm": 0.9611771106719971, + "learning_rate": 7.934316932184819e-07, + "loss": 0.5711, + "step": 12036 + }, + { + "epoch": 4.443391582574453, + "grad_norm": 0.9648510813713074, + "learning_rate": 7.930762989584894e-07, + "loss": 0.6026, + "step": 12037 + }, + { + "epoch": 4.443760767905489, + "grad_norm": 0.9454135298728943, + "learning_rate": 7.92720969305022e-07, + "loss": 0.5785, + "step": 12038 + }, + { + "epoch": 4.444129953236525, + "grad_norm": 0.946640133857727, + "learning_rate": 7.923657042715254e-07, + "loss": 0.5937, + "step": 12039 + }, + { + "epoch": 4.4444991385675605, + "grad_norm": 0.9527620077133179, + "learning_rate": 7.92010503871449e-07, + "loss": 0.5748, + "step": 12040 + }, + { + "epoch": 4.444868323898597, + "grad_norm": 0.9448414444923401, + "learning_rate": 7.916553681182349e-07, + "loss": 0.5473, + "step": 12041 + }, + { + "epoch": 4.445237509229633, + "grad_norm": 0.9417122602462769, + "learning_rate": 7.91300297025327e-07, + "loss": 0.5747, + "step": 12042 + }, + { + "epoch": 4.445606694560669, + "grad_norm": 0.9753829836845398, + "learning_rate": 7.909452906061634e-07, + "loss": 0.5777, + "step": 12043 + }, + { + "epoch": 4.445975879891706, + "grad_norm": 0.9527203440666199, + "learning_rate": 7.905903488741804e-07, + "loss": 0.5654, + "step": 12044 + }, + { + "epoch": 4.446345065222742, + "grad_norm": 0.9633192420005798, + "learning_rate": 7.902354718428132e-07, + "loss": 0.5539, + "step": 12045 + }, + { + "epoch": 4.446714250553778, + "grad_norm": 0.9476871490478516, + "learning_rate": 7.898806595254954e-07, + "loss": 0.5723, + "step": 12046 + }, + { + "epoch": 4.447083435884815, + "grad_norm": 0.9599629640579224, + "learning_rate": 7.895259119356532e-07, + "loss": 0.5454, + "step": 12047 + }, + { + "epoch": 4.4474526212158505, + "grad_norm": 0.9552610516548157, + "learning_rate": 7.891712290867165e-07, + "loss": 0.5881, + "step": 12048 + }, + { + "epoch": 4.447821806546886, + "grad_norm": 0.9455417990684509, + "learning_rate": 7.888166109921078e-07, + "loss": 0.5952, + "step": 12049 + }, + { + "epoch": 4.448190991877922, + "grad_norm": 0.9906212091445923, + "learning_rate": 7.88462057665251e-07, + "loss": 0.6042, + "step": 12050 + }, + { + "epoch": 4.448560177208959, + "grad_norm": 0.9464402794837952, + "learning_rate": 7.881075691195655e-07, + "loss": 0.5575, + "step": 12051 + }, + { + "epoch": 4.448929362539995, + "grad_norm": 0.9658629298210144, + "learning_rate": 7.877531453684672e-07, + "loss": 0.5843, + "step": 12052 + }, + { + "epoch": 4.449298547871031, + "grad_norm": 0.9435630440711975, + "learning_rate": 7.873987864253727e-07, + "loss": 0.5684, + "step": 12053 + }, + { + "epoch": 4.449667733202068, + "grad_norm": 0.9414200782775879, + "learning_rate": 7.870444923036938e-07, + "loss": 0.5501, + "step": 12054 + }, + { + "epoch": 4.450036918533104, + "grad_norm": 0.9745848774909973, + "learning_rate": 7.866902630168394e-07, + "loss": 0.5868, + "step": 12055 + }, + { + "epoch": 4.45040610386414, + "grad_norm": 0.9450812935829163, + "learning_rate": 7.863360985782184e-07, + "loss": 0.5552, + "step": 12056 + }, + { + "epoch": 4.450775289195176, + "grad_norm": 0.9280152916908264, + "learning_rate": 7.859819990012347e-07, + "loss": 0.5471, + "step": 12057 + }, + { + "epoch": 4.451144474526212, + "grad_norm": 0.9787061214447021, + "learning_rate": 7.856279642992928e-07, + "loss": 0.6025, + "step": 12058 + }, + { + "epoch": 4.451513659857248, + "grad_norm": 0.9728091359138489, + "learning_rate": 7.852739944857901e-07, + "loss": 0.5775, + "step": 12059 + }, + { + "epoch": 4.451882845188284, + "grad_norm": 0.941415011882782, + "learning_rate": 7.849200895741252e-07, + "loss": 0.5798, + "step": 12060 + }, + { + "epoch": 4.452252030519321, + "grad_norm": 0.9596430063247681, + "learning_rate": 7.845662495776945e-07, + "loss": 0.5678, + "step": 12061 + }, + { + "epoch": 4.452621215850357, + "grad_norm": 0.9451303482055664, + "learning_rate": 7.8421247450989e-07, + "loss": 0.574, + "step": 12062 + }, + { + "epoch": 4.452990401181393, + "grad_norm": 0.9216892719268799, + "learning_rate": 7.838587643841016e-07, + "loss": 0.6001, + "step": 12063 + }, + { + "epoch": 4.45335958651243, + "grad_norm": 0.9595593810081482, + "learning_rate": 7.835051192137169e-07, + "loss": 0.5915, + "step": 12064 + }, + { + "epoch": 4.4537287718434655, + "grad_norm": 0.9548695087432861, + "learning_rate": 7.831515390121214e-07, + "loss": 0.572, + "step": 12065 + }, + { + "epoch": 4.454097957174501, + "grad_norm": 0.9733303189277649, + "learning_rate": 7.827980237926999e-07, + "loss": 0.5682, + "step": 12066 + }, + { + "epoch": 4.454467142505537, + "grad_norm": 0.9412005543708801, + "learning_rate": 7.824445735688299e-07, + "loss": 0.5507, + "step": 12067 + }, + { + "epoch": 4.454836327836574, + "grad_norm": 0.9599558711051941, + "learning_rate": 7.820911883538912e-07, + "loss": 0.5717, + "step": 12068 + }, + { + "epoch": 4.45520551316761, + "grad_norm": 0.9555858373641968, + "learning_rate": 7.81737868161258e-07, + "loss": 0.5976, + "step": 12069 + }, + { + "epoch": 4.455574698498646, + "grad_norm": 0.9541544914245605, + "learning_rate": 7.813846130043054e-07, + "loss": 0.6096, + "step": 12070 + }, + { + "epoch": 4.455943883829683, + "grad_norm": 0.9877303838729858, + "learning_rate": 7.810314228964024e-07, + "loss": 0.5931, + "step": 12071 + }, + { + "epoch": 4.456313069160719, + "grad_norm": 0.937654435634613, + "learning_rate": 7.80678297850917e-07, + "loss": 0.5809, + "step": 12072 + }, + { + "epoch": 4.456682254491755, + "grad_norm": 1.0402960777282715, + "learning_rate": 7.803252378812163e-07, + "loss": 0.5855, + "step": 12073 + }, + { + "epoch": 4.457051439822791, + "grad_norm": 0.9579830169677734, + "learning_rate": 7.799722430006623e-07, + "loss": 0.5833, + "step": 12074 + }, + { + "epoch": 4.457420625153827, + "grad_norm": 0.981227457523346, + "learning_rate": 7.796193132226154e-07, + "loss": 0.6175, + "step": 12075 + }, + { + "epoch": 4.457789810484863, + "grad_norm": 0.9797912836074829, + "learning_rate": 7.792664485604354e-07, + "loss": 0.5703, + "step": 12076 + }, + { + "epoch": 4.4581589958159, + "grad_norm": 0.9466975927352905, + "learning_rate": 7.789136490274762e-07, + "loss": 0.5899, + "step": 12077 + }, + { + "epoch": 4.458528181146936, + "grad_norm": 0.9620150327682495, + "learning_rate": 7.78560914637094e-07, + "loss": 0.5402, + "step": 12078 + }, + { + "epoch": 4.458897366477972, + "grad_norm": 0.9793500900268555, + "learning_rate": 7.782082454026363e-07, + "loss": 0.5953, + "step": 12079 + }, + { + "epoch": 4.459266551809008, + "grad_norm": 0.9859143495559692, + "learning_rate": 7.77855641337453e-07, + "loss": 0.6015, + "step": 12080 + }, + { + "epoch": 4.459635737140045, + "grad_norm": 0.9464289546012878, + "learning_rate": 7.77503102454891e-07, + "loss": 0.5656, + "step": 12081 + }, + { + "epoch": 4.4600049224710805, + "grad_norm": 0.9500905275344849, + "learning_rate": 7.771506287682929e-07, + "loss": 0.575, + "step": 12082 + }, + { + "epoch": 4.460374107802116, + "grad_norm": 0.9671605229377747, + "learning_rate": 7.76798220290999e-07, + "loss": 0.5809, + "step": 12083 + }, + { + "epoch": 4.460743293133153, + "grad_norm": 0.9451783895492554, + "learning_rate": 7.764458770363495e-07, + "loss": 0.558, + "step": 12084 + }, + { + "epoch": 4.461112478464189, + "grad_norm": 1.0243314504623413, + "learning_rate": 7.760935990176785e-07, + "loss": 0.6273, + "step": 12085 + }, + { + "epoch": 4.461481663795225, + "grad_norm": 0.9778937101364136, + "learning_rate": 7.757413862483227e-07, + "loss": 0.5707, + "step": 12086 + }, + { + "epoch": 4.461850849126261, + "grad_norm": 0.951260507106781, + "learning_rate": 7.753892387416093e-07, + "loss": 0.5798, + "step": 12087 + }, + { + "epoch": 4.462220034457298, + "grad_norm": 0.972374439239502, + "learning_rate": 7.750371565108692e-07, + "loss": 0.5759, + "step": 12088 + }, + { + "epoch": 4.462589219788334, + "grad_norm": 0.9329587817192078, + "learning_rate": 7.746851395694291e-07, + "loss": 0.5767, + "step": 12089 + }, + { + "epoch": 4.46295840511937, + "grad_norm": 0.976928174495697, + "learning_rate": 7.743331879306117e-07, + "loss": 0.5757, + "step": 12090 + }, + { + "epoch": 4.463327590450406, + "grad_norm": 0.932277262210846, + "learning_rate": 7.739813016077388e-07, + "loss": 0.5763, + "step": 12091 + }, + { + "epoch": 4.463696775781442, + "grad_norm": 0.943439781665802, + "learning_rate": 7.736294806141279e-07, + "loss": 0.5543, + "step": 12092 + }, + { + "epoch": 4.464065961112478, + "grad_norm": 0.9397492408752441, + "learning_rate": 7.732777249630963e-07, + "loss": 0.6104, + "step": 12093 + }, + { + "epoch": 4.464435146443515, + "grad_norm": 0.9782150983810425, + "learning_rate": 7.729260346679595e-07, + "loss": 0.553, + "step": 12094 + }, + { + "epoch": 4.464804331774551, + "grad_norm": 0.9700337052345276, + "learning_rate": 7.725744097420257e-07, + "loss": 0.5868, + "step": 12095 + }, + { + "epoch": 4.465173517105587, + "grad_norm": 0.9617190957069397, + "learning_rate": 7.722228501986059e-07, + "loss": 0.5909, + "step": 12096 + }, + { + "epoch": 4.465542702436624, + "grad_norm": 0.9566123485565186, + "learning_rate": 7.718713560510052e-07, + "loss": 0.6049, + "step": 12097 + }, + { + "epoch": 4.46591188776766, + "grad_norm": 0.9785389304161072, + "learning_rate": 7.715199273125292e-07, + "loss": 0.5826, + "step": 12098 + }, + { + "epoch": 4.4662810730986955, + "grad_norm": 0.9809549450874329, + "learning_rate": 7.711685639964784e-07, + "loss": 0.5927, + "step": 12099 + }, + { + "epoch": 4.466650258429731, + "grad_norm": 0.9499883055686951, + "learning_rate": 7.708172661161509e-07, + "loss": 0.5762, + "step": 12100 + }, + { + "epoch": 4.467019443760768, + "grad_norm": 0.9351134300231934, + "learning_rate": 7.704660336848452e-07, + "loss": 0.5744, + "step": 12101 + }, + { + "epoch": 4.467388629091804, + "grad_norm": 0.9400444626808167, + "learning_rate": 7.70114866715854e-07, + "loss": 0.5834, + "step": 12102 + }, + { + "epoch": 4.46775781442284, + "grad_norm": 0.9654927849769592, + "learning_rate": 7.697637652224685e-07, + "loss": 0.6236, + "step": 12103 + }, + { + "epoch": 4.468126999753877, + "grad_norm": 0.9821814894676208, + "learning_rate": 7.69412729217979e-07, + "loss": 0.5921, + "step": 12104 + }, + { + "epoch": 4.468496185084913, + "grad_norm": 0.9861723184585571, + "learning_rate": 7.69061758715671e-07, + "loss": 0.5394, + "step": 12105 + }, + { + "epoch": 4.468865370415949, + "grad_norm": 0.9470887184143066, + "learning_rate": 7.687108537288307e-07, + "loss": 0.5758, + "step": 12106 + }, + { + "epoch": 4.469234555746985, + "grad_norm": 0.9796518683433533, + "learning_rate": 7.683600142707364e-07, + "loss": 0.5924, + "step": 12107 + }, + { + "epoch": 4.469603741078021, + "grad_norm": 0.984751284122467, + "learning_rate": 7.680092403546691e-07, + "loss": 0.5476, + "step": 12108 + }, + { + "epoch": 4.469972926409057, + "grad_norm": 0.9109735488891602, + "learning_rate": 7.676585319939061e-07, + "loss": 0.5429, + "step": 12109 + }, + { + "epoch": 4.470342111740093, + "grad_norm": 0.9739021062850952, + "learning_rate": 7.673078892017213e-07, + "loss": 0.5544, + "step": 12110 + }, + { + "epoch": 4.47071129707113, + "grad_norm": 0.9552311897277832, + "learning_rate": 7.669573119913859e-07, + "loss": 0.5918, + "step": 12111 + }, + { + "epoch": 4.471080482402166, + "grad_norm": 0.9531885981559753, + "learning_rate": 7.666068003761684e-07, + "loss": 0.6121, + "step": 12112 + }, + { + "epoch": 4.471449667733202, + "grad_norm": 0.9379632472991943, + "learning_rate": 7.662563543693364e-07, + "loss": 0.5649, + "step": 12113 + }, + { + "epoch": 4.471818853064239, + "grad_norm": 0.9618393182754517, + "learning_rate": 7.659059739841559e-07, + "loss": 0.5589, + "step": 12114 + }, + { + "epoch": 4.472188038395275, + "grad_norm": 0.9491791725158691, + "learning_rate": 7.655556592338853e-07, + "loss": 0.5485, + "step": 12115 + }, + { + "epoch": 4.4725572237263105, + "grad_norm": 0.9639852046966553, + "learning_rate": 7.652054101317866e-07, + "loss": 0.5729, + "step": 12116 + }, + { + "epoch": 4.472926409057346, + "grad_norm": 0.9602736234664917, + "learning_rate": 7.648552266911147e-07, + "loss": 0.5706, + "step": 12117 + }, + { + "epoch": 4.473295594388383, + "grad_norm": 0.9208425879478455, + "learning_rate": 7.645051089251254e-07, + "loss": 0.5335, + "step": 12118 + }, + { + "epoch": 4.473664779719419, + "grad_norm": 0.9308244585990906, + "learning_rate": 7.641550568470704e-07, + "loss": 0.58, + "step": 12119 + }, + { + "epoch": 4.474033965050455, + "grad_norm": 1.0192749500274658, + "learning_rate": 7.638050704701977e-07, + "loss": 0.5678, + "step": 12120 + }, + { + "epoch": 4.474403150381492, + "grad_norm": 0.9382200241088867, + "learning_rate": 7.634551498077558e-07, + "loss": 0.5902, + "step": 12121 + }, + { + "epoch": 4.474772335712528, + "grad_norm": 0.985787570476532, + "learning_rate": 7.631052948729887e-07, + "loss": 0.595, + "step": 12122 + }, + { + "epoch": 4.475141521043564, + "grad_norm": 0.9942469596862793, + "learning_rate": 7.627555056791369e-07, + "loss": 0.5698, + "step": 12123 + }, + { + "epoch": 4.4755107063746, + "grad_norm": 0.9550743103027344, + "learning_rate": 7.62405782239442e-07, + "loss": 0.5745, + "step": 12124 + }, + { + "epoch": 4.475879891705636, + "grad_norm": 0.963222086429596, + "learning_rate": 7.620561245671387e-07, + "loss": 0.5939, + "step": 12125 + }, + { + "epoch": 4.476249077036672, + "grad_norm": 0.9331395030021667, + "learning_rate": 7.617065326754638e-07, + "loss": 0.5932, + "step": 12126 + }, + { + "epoch": 4.476618262367708, + "grad_norm": 0.9524618983268738, + "learning_rate": 7.613570065776479e-07, + "loss": 0.5845, + "step": 12127 + }, + { + "epoch": 4.476987447698745, + "grad_norm": 1.0848585367202759, + "learning_rate": 7.610075462869195e-07, + "loss": 0.5686, + "step": 12128 + }, + { + "epoch": 4.477356633029781, + "grad_norm": 0.9546926021575928, + "learning_rate": 7.606581518165077e-07, + "loss": 0.5959, + "step": 12129 + }, + { + "epoch": 4.477725818360817, + "grad_norm": 0.9212685227394104, + "learning_rate": 7.60308823179636e-07, + "loss": 0.5458, + "step": 12130 + }, + { + "epoch": 4.478095003691854, + "grad_norm": 0.9554281234741211, + "learning_rate": 7.599595603895251e-07, + "loss": 0.5961, + "step": 12131 + }, + { + "epoch": 4.47846418902289, + "grad_norm": 1.0319945812225342, + "learning_rate": 7.596103634593968e-07, + "loss": 0.5833, + "step": 12132 + }, + { + "epoch": 4.4788333743539255, + "grad_norm": 0.9400152564048767, + "learning_rate": 7.592612324024662e-07, + "loss": 0.6131, + "step": 12133 + }, + { + "epoch": 4.479202559684962, + "grad_norm": 1.0152634382247925, + "learning_rate": 7.5891216723195e-07, + "loss": 0.5743, + "step": 12134 + }, + { + "epoch": 4.479571745015998, + "grad_norm": 0.9611108899116516, + "learning_rate": 7.585631679610572e-07, + "loss": 0.5563, + "step": 12135 + }, + { + "epoch": 4.479940930347034, + "grad_norm": 0.9372648596763611, + "learning_rate": 7.582142346029991e-07, + "loss": 0.583, + "step": 12136 + }, + { + "epoch": 4.48031011567807, + "grad_norm": 0.9650997519493103, + "learning_rate": 7.578653671709832e-07, + "loss": 0.5828, + "step": 12137 + }, + { + "epoch": 4.480679301009107, + "grad_norm": 0.9947588443756104, + "learning_rate": 7.575165656782132e-07, + "loss": 0.5573, + "step": 12138 + }, + { + "epoch": 4.481048486340143, + "grad_norm": 0.9653211236000061, + "learning_rate": 7.571678301378913e-07, + "loss": 0.584, + "step": 12139 + }, + { + "epoch": 4.481417671671179, + "grad_norm": 0.9542648792266846, + "learning_rate": 7.568191605632161e-07, + "loss": 0.5476, + "step": 12140 + }, + { + "epoch": 4.4817868570022155, + "grad_norm": 0.95871901512146, + "learning_rate": 7.564705569673864e-07, + "loss": 0.5641, + "step": 12141 + }, + { + "epoch": 4.482156042333251, + "grad_norm": 0.9811316728591919, + "learning_rate": 7.56122019363596e-07, + "loss": 0.563, + "step": 12142 + }, + { + "epoch": 4.482525227664287, + "grad_norm": 0.9298844933509827, + "learning_rate": 7.557735477650357e-07, + "loss": 0.5549, + "step": 12143 + }, + { + "epoch": 4.482894412995323, + "grad_norm": 0.9618381261825562, + "learning_rate": 7.554251421848971e-07, + "loss": 0.5855, + "step": 12144 + }, + { + "epoch": 4.48326359832636, + "grad_norm": 0.9528921842575073, + "learning_rate": 7.550768026363656e-07, + "loss": 0.5518, + "step": 12145 + }, + { + "epoch": 4.483632783657396, + "grad_norm": 0.9533156156539917, + "learning_rate": 7.54728529132627e-07, + "loss": 0.6006, + "step": 12146 + }, + { + "epoch": 4.484001968988432, + "grad_norm": 0.9566360116004944, + "learning_rate": 7.543803216868628e-07, + "loss": 0.5641, + "step": 12147 + }, + { + "epoch": 4.484371154319469, + "grad_norm": 0.9561976194381714, + "learning_rate": 7.540321803122517e-07, + "loss": 0.556, + "step": 12148 + }, + { + "epoch": 4.484740339650505, + "grad_norm": 0.9265380501747131, + "learning_rate": 7.536841050219723e-07, + "loss": 0.5682, + "step": 12149 + }, + { + "epoch": 4.4851095249815405, + "grad_norm": 0.9715983271598816, + "learning_rate": 7.533360958291986e-07, + "loss": 0.556, + "step": 12150 + }, + { + "epoch": 4.485478710312577, + "grad_norm": 0.998029351234436, + "learning_rate": 7.529881527471014e-07, + "loss": 0.5615, + "step": 12151 + }, + { + "epoch": 4.485847895643613, + "grad_norm": 0.9438183307647705, + "learning_rate": 7.52640275788852e-07, + "loss": 0.5608, + "step": 12152 + }, + { + "epoch": 4.486217080974649, + "grad_norm": 0.9789996147155762, + "learning_rate": 7.522924649676161e-07, + "loss": 0.588, + "step": 12153 + }, + { + "epoch": 4.486586266305686, + "grad_norm": 0.9676276445388794, + "learning_rate": 7.519447202965602e-07, + "loss": 0.5584, + "step": 12154 + }, + { + "epoch": 4.486955451636722, + "grad_norm": 0.9895492792129517, + "learning_rate": 7.515970417888435e-07, + "loss": 0.5962, + "step": 12155 + }, + { + "epoch": 4.487324636967758, + "grad_norm": 0.9685246348381042, + "learning_rate": 7.51249429457627e-07, + "loss": 0.5696, + "step": 12156 + }, + { + "epoch": 4.487693822298794, + "grad_norm": 0.9356187582015991, + "learning_rate": 7.509018833160684e-07, + "loss": 0.5952, + "step": 12157 + }, + { + "epoch": 4.4880630076298305, + "grad_norm": 0.9478508234024048, + "learning_rate": 7.505544033773212e-07, + "loss": 0.5844, + "step": 12158 + }, + { + "epoch": 4.488432192960866, + "grad_norm": 1.0098007917404175, + "learning_rate": 7.502069896545375e-07, + "loss": 0.5868, + "step": 12159 + }, + { + "epoch": 4.488801378291902, + "grad_norm": 0.9368032217025757, + "learning_rate": 7.498596421608673e-07, + "loss": 0.6119, + "step": 12160 + }, + { + "epoch": 4.489170563622939, + "grad_norm": 0.9524245858192444, + "learning_rate": 7.495123609094576e-07, + "loss": 0.5407, + "step": 12161 + }, + { + "epoch": 4.489539748953975, + "grad_norm": 0.9197715520858765, + "learning_rate": 7.491651459134524e-07, + "loss": 0.5593, + "step": 12162 + }, + { + "epoch": 4.489908934285011, + "grad_norm": 1.0122522115707397, + "learning_rate": 7.488179971859933e-07, + "loss": 0.5638, + "step": 12163 + }, + { + "epoch": 4.490278119616047, + "grad_norm": 0.9605528116226196, + "learning_rate": 7.4847091474022e-07, + "loss": 0.5425, + "step": 12164 + }, + { + "epoch": 4.490647304947084, + "grad_norm": 0.9587163329124451, + "learning_rate": 7.481238985892708e-07, + "loss": 0.5685, + "step": 12165 + }, + { + "epoch": 4.4910164902781196, + "grad_norm": 0.9711236357688904, + "learning_rate": 7.477769487462791e-07, + "loss": 0.5585, + "step": 12166 + }, + { + "epoch": 4.4913856756091555, + "grad_norm": 0.9682841300964355, + "learning_rate": 7.474300652243768e-07, + "loss": 0.5773, + "step": 12167 + }, + { + "epoch": 4.491754860940192, + "grad_norm": 0.9507418870925903, + "learning_rate": 7.470832480366926e-07, + "loss": 0.5471, + "step": 12168 + }, + { + "epoch": 4.492124046271228, + "grad_norm": 0.9381231069564819, + "learning_rate": 7.467364971963551e-07, + "loss": 0.5386, + "step": 12169 + }, + { + "epoch": 4.492493231602264, + "grad_norm": 0.9511252641677856, + "learning_rate": 7.463898127164879e-07, + "loss": 0.5682, + "step": 12170 + }, + { + "epoch": 4.492862416933301, + "grad_norm": 0.9858061671257019, + "learning_rate": 7.460431946102118e-07, + "loss": 0.5554, + "step": 12171 + }, + { + "epoch": 4.493231602264337, + "grad_norm": 0.9587076902389526, + "learning_rate": 7.456966428906482e-07, + "loss": 0.5708, + "step": 12172 + }, + { + "epoch": 4.493600787595373, + "grad_norm": 0.9468465447425842, + "learning_rate": 7.453501575709124e-07, + "loss": 0.5768, + "step": 12173 + }, + { + "epoch": 4.493969972926409, + "grad_norm": 0.9306775331497192, + "learning_rate": 7.450037386641199e-07, + "loss": 0.5879, + "step": 12174 + }, + { + "epoch": 4.4943391582574455, + "grad_norm": 0.9505054354667664, + "learning_rate": 7.446573861833819e-07, + "loss": 0.5551, + "step": 12175 + }, + { + "epoch": 4.494708343588481, + "grad_norm": 0.936667799949646, + "learning_rate": 7.443111001418074e-07, + "loss": 0.5756, + "step": 12176 + }, + { + "epoch": 4.495077528919517, + "grad_norm": 0.9557254314422607, + "learning_rate": 7.43964880552504e-07, + "loss": 0.5692, + "step": 12177 + }, + { + "epoch": 4.495446714250554, + "grad_norm": 0.9884572625160217, + "learning_rate": 7.436187274285761e-07, + "loss": 0.5935, + "step": 12178 + }, + { + "epoch": 4.49581589958159, + "grad_norm": 0.9570366144180298, + "learning_rate": 7.432726407831239e-07, + "loss": 0.5527, + "step": 12179 + }, + { + "epoch": 4.496185084912626, + "grad_norm": 0.9615358114242554, + "learning_rate": 7.42926620629249e-07, + "loss": 0.5596, + "step": 12180 + }, + { + "epoch": 4.496554270243663, + "grad_norm": 0.9613714814186096, + "learning_rate": 7.425806669800467e-07, + "loss": 0.577, + "step": 12181 + }, + { + "epoch": 4.496923455574699, + "grad_norm": 0.9838926792144775, + "learning_rate": 7.422347798486115e-07, + "loss": 0.5644, + "step": 12182 + }, + { + "epoch": 4.4972926409057346, + "grad_norm": 0.9528496265411377, + "learning_rate": 7.418889592480343e-07, + "loss": 0.5852, + "step": 12183 + }, + { + "epoch": 4.4976618262367705, + "grad_norm": 0.9610999226570129, + "learning_rate": 7.415432051914053e-07, + "loss": 0.5613, + "step": 12184 + }, + { + "epoch": 4.498031011567807, + "grad_norm": 0.9029250741004944, + "learning_rate": 7.411975176918118e-07, + "loss": 0.5811, + "step": 12185 + }, + { + "epoch": 4.498400196898843, + "grad_norm": 0.9844378232955933, + "learning_rate": 7.408518967623368e-07, + "loss": 0.5519, + "step": 12186 + }, + { + "epoch": 4.498769382229879, + "grad_norm": 0.9512313604354858, + "learning_rate": 7.405063424160628e-07, + "loss": 0.5741, + "step": 12187 + }, + { + "epoch": 4.499138567560916, + "grad_norm": 0.943385660648346, + "learning_rate": 7.401608546660674e-07, + "loss": 0.6046, + "step": 12188 + }, + { + "epoch": 4.499507752891952, + "grad_norm": 0.9336569905281067, + "learning_rate": 7.39815433525429e-07, + "loss": 0.5899, + "step": 12189 + }, + { + "epoch": 4.499876938222988, + "grad_norm": 0.9708227515220642, + "learning_rate": 7.394700790072209e-07, + "loss": 0.5505, + "step": 12190 + }, + { + "epoch": 4.5002461235540245, + "grad_norm": 0.9597625136375427, + "learning_rate": 7.391247911245139e-07, + "loss": 0.5782, + "step": 12191 + }, + { + "epoch": 4.5006153088850605, + "grad_norm": 0.9492368102073669, + "learning_rate": 7.387795698903788e-07, + "loss": 0.5821, + "step": 12192 + }, + { + "epoch": 4.500984494216096, + "grad_norm": 0.9708461165428162, + "learning_rate": 7.384344153178801e-07, + "loss": 0.5879, + "step": 12193 + }, + { + "epoch": 4.501353679547132, + "grad_norm": 0.9757567644119263, + "learning_rate": 7.380893274200837e-07, + "loss": 0.5523, + "step": 12194 + }, + { + "epoch": 4.501722864878169, + "grad_norm": 0.9351304173469543, + "learning_rate": 7.377443062100503e-07, + "loss": 0.5778, + "step": 12195 + }, + { + "epoch": 4.502092050209205, + "grad_norm": 0.9521865248680115, + "learning_rate": 7.373993517008379e-07, + "loss": 0.5594, + "step": 12196 + }, + { + "epoch": 4.502461235540241, + "grad_norm": 0.9460328221321106, + "learning_rate": 7.370544639055044e-07, + "loss": 0.5625, + "step": 12197 + }, + { + "epoch": 4.502830420871278, + "grad_norm": 0.9364818334579468, + "learning_rate": 7.367096428371032e-07, + "loss": 0.6014, + "step": 12198 + }, + { + "epoch": 4.503199606202314, + "grad_norm": 0.9804808497428894, + "learning_rate": 7.363648885086846e-07, + "loss": 0.544, + "step": 12199 + }, + { + "epoch": 4.5035687915333495, + "grad_norm": 0.9334636926651001, + "learning_rate": 7.360202009332993e-07, + "loss": 0.5705, + "step": 12200 + }, + { + "epoch": 4.5039379768643855, + "grad_norm": 0.9597759246826172, + "learning_rate": 7.356755801239926e-07, + "loss": 0.5559, + "step": 12201 + }, + { + "epoch": 4.504307162195422, + "grad_norm": 0.952621579170227, + "learning_rate": 7.353310260938076e-07, + "loss": 0.5576, + "step": 12202 + }, + { + "epoch": 4.504676347526458, + "grad_norm": 0.9670845866203308, + "learning_rate": 7.34986538855787e-07, + "loss": 0.5931, + "step": 12203 + }, + { + "epoch": 4.505045532857494, + "grad_norm": 0.9714767932891846, + "learning_rate": 7.346421184229683e-07, + "loss": 0.565, + "step": 12204 + }, + { + "epoch": 4.505414718188531, + "grad_norm": 0.9388136267662048, + "learning_rate": 7.342977648083888e-07, + "loss": 0.59, + "step": 12205 + }, + { + "epoch": 4.505783903519567, + "grad_norm": 0.9942989945411682, + "learning_rate": 7.339534780250818e-07, + "loss": 0.6071, + "step": 12206 + }, + { + "epoch": 4.506153088850603, + "grad_norm": 0.9769531488418579, + "learning_rate": 7.336092580860774e-07, + "loss": 0.5851, + "step": 12207 + }, + { + "epoch": 4.5065222741816395, + "grad_norm": 0.9396677613258362, + "learning_rate": 7.332651050044059e-07, + "loss": 0.5673, + "step": 12208 + }, + { + "epoch": 4.5068914595126754, + "grad_norm": 0.957573413848877, + "learning_rate": 7.329210187930927e-07, + "loss": 0.5892, + "step": 12209 + }, + { + "epoch": 4.507260644843711, + "grad_norm": 0.9689034819602966, + "learning_rate": 7.32576999465161e-07, + "loss": 0.5886, + "step": 12210 + }, + { + "epoch": 4.507629830174748, + "grad_norm": 0.9690962433815002, + "learning_rate": 7.322330470336314e-07, + "loss": 0.5672, + "step": 12211 + }, + { + "epoch": 4.507999015505784, + "grad_norm": 0.9822025895118713, + "learning_rate": 7.318891615115231e-07, + "loss": 0.5649, + "step": 12212 + }, + { + "epoch": 4.50836820083682, + "grad_norm": 0.9319487810134888, + "learning_rate": 7.315453429118527e-07, + "loss": 0.5378, + "step": 12213 + }, + { + "epoch": 4.508737386167856, + "grad_norm": 0.963357150554657, + "learning_rate": 7.312015912476331e-07, + "loss": 0.564, + "step": 12214 + }, + { + "epoch": 4.509106571498893, + "grad_norm": 0.9598106145858765, + "learning_rate": 7.308579065318749e-07, + "loss": 0.5671, + "step": 12215 + }, + { + "epoch": 4.509475756829929, + "grad_norm": 0.9632495641708374, + "learning_rate": 7.30514288777586e-07, + "loss": 0.5562, + "step": 12216 + }, + { + "epoch": 4.5098449421609645, + "grad_norm": 0.9595595598220825, + "learning_rate": 7.301707379977732e-07, + "loss": 0.5855, + "step": 12217 + }, + { + "epoch": 4.510214127492001, + "grad_norm": 1.0222808122634888, + "learning_rate": 7.298272542054396e-07, + "loss": 0.5886, + "step": 12218 + }, + { + "epoch": 4.510583312823037, + "grad_norm": 0.9594646096229553, + "learning_rate": 7.294838374135851e-07, + "loss": 0.5759, + "step": 12219 + }, + { + "epoch": 4.510952498154073, + "grad_norm": 0.9406309723854065, + "learning_rate": 7.291404876352093e-07, + "loss": 0.5711, + "step": 12220 + }, + { + "epoch": 4.511321683485109, + "grad_norm": 0.966749370098114, + "learning_rate": 7.287972048833064e-07, + "loss": 0.5724, + "step": 12221 + }, + { + "epoch": 4.511690868816146, + "grad_norm": 0.9474830031394958, + "learning_rate": 7.284539891708709e-07, + "loss": 0.5907, + "step": 12222 + }, + { + "epoch": 4.512060054147182, + "grad_norm": 0.9561360478401184, + "learning_rate": 7.28110840510893e-07, + "loss": 0.5942, + "step": 12223 + }, + { + "epoch": 4.512429239478218, + "grad_norm": 0.9693170189857483, + "learning_rate": 7.277677589163595e-07, + "loss": 0.5795, + "step": 12224 + }, + { + "epoch": 4.5127984248092545, + "grad_norm": 0.9712854623794556, + "learning_rate": 7.27424744400258e-07, + "loss": 0.6008, + "step": 12225 + }, + { + "epoch": 4.51316761014029, + "grad_norm": 0.9881293177604675, + "learning_rate": 7.270817969755705e-07, + "loss": 0.5798, + "step": 12226 + }, + { + "epoch": 4.513536795471326, + "grad_norm": 0.9538909792900085, + "learning_rate": 7.267389166552769e-07, + "loss": 0.5824, + "step": 12227 + }, + { + "epoch": 4.513905980802363, + "grad_norm": 0.9383710026741028, + "learning_rate": 7.263961034523559e-07, + "loss": 0.5949, + "step": 12228 + }, + { + "epoch": 4.514275166133399, + "grad_norm": 0.9528449177742004, + "learning_rate": 7.260533573797829e-07, + "loss": 0.585, + "step": 12229 + }, + { + "epoch": 4.514644351464435, + "grad_norm": 0.9622063040733337, + "learning_rate": 7.257106784505297e-07, + "loss": 0.5611, + "step": 12230 + }, + { + "epoch": 4.515013536795472, + "grad_norm": 0.9646047353744507, + "learning_rate": 7.253680666775681e-07, + "loss": 0.5824, + "step": 12231 + }, + { + "epoch": 4.515382722126508, + "grad_norm": 0.9385722279548645, + "learning_rate": 7.250255220738644e-07, + "loss": 0.5613, + "step": 12232 + }, + { + "epoch": 4.515751907457544, + "grad_norm": 0.950657069683075, + "learning_rate": 7.246830446523853e-07, + "loss": 0.5734, + "step": 12233 + }, + { + "epoch": 4.5161210927885795, + "grad_norm": 0.9764195680618286, + "learning_rate": 7.243406344260925e-07, + "loss": 0.5647, + "step": 12234 + }, + { + "epoch": 4.516490278119616, + "grad_norm": 0.9589142203330994, + "learning_rate": 7.239982914079455e-07, + "loss": 0.6038, + "step": 12235 + }, + { + "epoch": 4.516859463450652, + "grad_norm": 0.9747604131698608, + "learning_rate": 7.236560156109035e-07, + "loss": 0.5726, + "step": 12236 + }, + { + "epoch": 4.517228648781688, + "grad_norm": 1.0091286897659302, + "learning_rate": 7.233138070479206e-07, + "loss": 0.5835, + "step": 12237 + }, + { + "epoch": 4.517597834112725, + "grad_norm": 0.9551447629928589, + "learning_rate": 7.229716657319494e-07, + "loss": 0.55, + "step": 12238 + }, + { + "epoch": 4.517967019443761, + "grad_norm": 0.9514274001121521, + "learning_rate": 7.22629591675939e-07, + "loss": 0.5468, + "step": 12239 + }, + { + "epoch": 4.518336204774797, + "grad_norm": 0.9645543694496155, + "learning_rate": 7.222875848928379e-07, + "loss": 0.5948, + "step": 12240 + }, + { + "epoch": 4.518705390105833, + "grad_norm": 0.9626505970954895, + "learning_rate": 7.219456453955912e-07, + "loss": 0.588, + "step": 12241 + }, + { + "epoch": 4.5190745754368695, + "grad_norm": 0.9834396243095398, + "learning_rate": 7.216037731971406e-07, + "loss": 0.5914, + "step": 12242 + }, + { + "epoch": 4.519443760767905, + "grad_norm": 0.9703446626663208, + "learning_rate": 7.21261968310426e-07, + "loss": 0.5574, + "step": 12243 + }, + { + "epoch": 4.519812946098941, + "grad_norm": 0.9675273895263672, + "learning_rate": 7.20920230748384e-07, + "loss": 0.6021, + "step": 12244 + }, + { + "epoch": 4.520182131429978, + "grad_norm": 0.967186689376831, + "learning_rate": 7.205785605239501e-07, + "loss": 0.5409, + "step": 12245 + }, + { + "epoch": 4.520551316761014, + "grad_norm": 0.9747320413589478, + "learning_rate": 7.202369576500562e-07, + "loss": 0.6, + "step": 12246 + }, + { + "epoch": 4.52092050209205, + "grad_norm": 0.9379962086677551, + "learning_rate": 7.198954221396312e-07, + "loss": 0.5381, + "step": 12247 + }, + { + "epoch": 4.521289687423087, + "grad_norm": 0.9190950393676758, + "learning_rate": 7.195539540056032e-07, + "loss": 0.5844, + "step": 12248 + }, + { + "epoch": 4.521658872754123, + "grad_norm": 0.9839637875556946, + "learning_rate": 7.192125532608962e-07, + "loss": 0.5922, + "step": 12249 + }, + { + "epoch": 4.522028058085159, + "grad_norm": 0.9515743255615234, + "learning_rate": 7.188712199184314e-07, + "loss": 0.5681, + "step": 12250 + }, + { + "epoch": 4.522397243416195, + "grad_norm": 0.9648337364196777, + "learning_rate": 7.185299539911295e-07, + "loss": 0.5728, + "step": 12251 + }, + { + "epoch": 4.522766428747231, + "grad_norm": 0.9614214301109314, + "learning_rate": 7.181887554919056e-07, + "loss": 0.5599, + "step": 12252 + }, + { + "epoch": 4.523135614078267, + "grad_norm": 0.9839128255844116, + "learning_rate": 7.17847624433676e-07, + "loss": 0.5822, + "step": 12253 + }, + { + "epoch": 4.523504799409303, + "grad_norm": 0.9724512100219727, + "learning_rate": 7.175065608293513e-07, + "loss": 0.5753, + "step": 12254 + }, + { + "epoch": 4.52387398474034, + "grad_norm": 0.9749932289123535, + "learning_rate": 7.171655646918399e-07, + "loss": 0.5741, + "step": 12255 + }, + { + "epoch": 4.524243170071376, + "grad_norm": 0.963465690612793, + "learning_rate": 7.168246360340501e-07, + "loss": 0.5905, + "step": 12256 + }, + { + "epoch": 4.524612355402412, + "grad_norm": 0.9654296636581421, + "learning_rate": 7.16483774868885e-07, + "loss": 0.5759, + "step": 12257 + }, + { + "epoch": 4.524981540733448, + "grad_norm": 0.9822253584861755, + "learning_rate": 7.16142981209246e-07, + "loss": 0.5544, + "step": 12258 + }, + { + "epoch": 4.5253507260644845, + "grad_norm": 0.9716669917106628, + "learning_rate": 7.158022550680313e-07, + "loss": 0.5601, + "step": 12259 + }, + { + "epoch": 4.52571991139552, + "grad_norm": 1.1417088508605957, + "learning_rate": 7.154615964581385e-07, + "loss": 0.5693, + "step": 12260 + }, + { + "epoch": 4.526089096726556, + "grad_norm": 0.9350441098213196, + "learning_rate": 7.151210053924615e-07, + "loss": 0.5092, + "step": 12261 + }, + { + "epoch": 4.526458282057593, + "grad_norm": 0.9911386966705322, + "learning_rate": 7.147804818838913e-07, + "loss": 0.5798, + "step": 12262 + }, + { + "epoch": 4.526827467388629, + "grad_norm": 1.0134905576705933, + "learning_rate": 7.144400259453163e-07, + "loss": 0.5729, + "step": 12263 + }, + { + "epoch": 4.527196652719665, + "grad_norm": 0.9741863012313843, + "learning_rate": 7.140996375896222e-07, + "loss": 0.5346, + "step": 12264 + }, + { + "epoch": 4.527565838050702, + "grad_norm": 0.9437469244003296, + "learning_rate": 7.137593168296938e-07, + "loss": 0.5662, + "step": 12265 + }, + { + "epoch": 4.527935023381738, + "grad_norm": 0.9622087478637695, + "learning_rate": 7.134190636784116e-07, + "loss": 0.5604, + "step": 12266 + }, + { + "epoch": 4.528304208712774, + "grad_norm": 0.94486004114151, + "learning_rate": 7.130788781486534e-07, + "loss": 0.5863, + "step": 12267 + }, + { + "epoch": 4.52867339404381, + "grad_norm": 0.9610571265220642, + "learning_rate": 7.127387602532953e-07, + "loss": 0.5824, + "step": 12268 + }, + { + "epoch": 4.529042579374846, + "grad_norm": 1.0051312446594238, + "learning_rate": 7.123987100052129e-07, + "loss": 0.587, + "step": 12269 + }, + { + "epoch": 4.529411764705882, + "grad_norm": 0.9809900522232056, + "learning_rate": 7.120587274172735e-07, + "loss": 0.5839, + "step": 12270 + }, + { + "epoch": 4.529780950036918, + "grad_norm": 0.955378532409668, + "learning_rate": 7.117188125023478e-07, + "loss": 0.5811, + "step": 12271 + }, + { + "epoch": 4.530150135367955, + "grad_norm": 0.9277257919311523, + "learning_rate": 7.113789652733e-07, + "loss": 0.5571, + "step": 12272 + }, + { + "epoch": 4.530519320698991, + "grad_norm": 0.963740885257721, + "learning_rate": 7.110391857429943e-07, + "loss": 0.5734, + "step": 12273 + }, + { + "epoch": 4.530888506030027, + "grad_norm": 0.9289120435714722, + "learning_rate": 7.106994739242912e-07, + "loss": 0.5361, + "step": 12274 + }, + { + "epoch": 4.531257691361064, + "grad_norm": 0.9231817126274109, + "learning_rate": 7.103598298300471e-07, + "loss": 0.5781, + "step": 12275 + }, + { + "epoch": 4.5316268766920995, + "grad_norm": 0.976919412612915, + "learning_rate": 7.100202534731196e-07, + "loss": 0.5165, + "step": 12276 + }, + { + "epoch": 4.531996062023135, + "grad_norm": 0.896682858467102, + "learning_rate": 7.096807448663606e-07, + "loss": 0.5573, + "step": 12277 + }, + { + "epoch": 4.532365247354171, + "grad_norm": 0.9341700077056885, + "learning_rate": 7.093413040226196e-07, + "loss": 0.5609, + "step": 12278 + }, + { + "epoch": 4.532734432685208, + "grad_norm": 0.9523647427558899, + "learning_rate": 7.09001930954746e-07, + "loss": 0.5811, + "step": 12279 + }, + { + "epoch": 4.533103618016244, + "grad_norm": 0.9544100761413574, + "learning_rate": 7.086626256755832e-07, + "loss": 0.5897, + "step": 12280 + }, + { + "epoch": 4.53347280334728, + "grad_norm": 0.97161865234375, + "learning_rate": 7.083233881979754e-07, + "loss": 0.5337, + "step": 12281 + }, + { + "epoch": 4.533841988678317, + "grad_norm": 0.9109223484992981, + "learning_rate": 7.079842185347618e-07, + "loss": 0.6032, + "step": 12282 + }, + { + "epoch": 4.534211174009353, + "grad_norm": 0.9867826104164124, + "learning_rate": 7.076451166987794e-07, + "loss": 0.5808, + "step": 12283 + }, + { + "epoch": 4.534580359340389, + "grad_norm": 0.9878796339035034, + "learning_rate": 7.073060827028644e-07, + "loss": 0.5941, + "step": 12284 + }, + { + "epoch": 4.534949544671425, + "grad_norm": 0.9644319415092468, + "learning_rate": 7.069671165598482e-07, + "loss": 0.5736, + "step": 12285 + }, + { + "epoch": 4.535318730002461, + "grad_norm": 0.9399902820587158, + "learning_rate": 7.06628218282561e-07, + "loss": 0.5351, + "step": 12286 + }, + { + "epoch": 4.535687915333497, + "grad_norm": 0.9620053768157959, + "learning_rate": 7.062893878838286e-07, + "loss": 0.5568, + "step": 12287 + }, + { + "epoch": 4.536057100664534, + "grad_norm": 0.9786978363990784, + "learning_rate": 7.059506253764773e-07, + "loss": 0.5783, + "step": 12288 + }, + { + "epoch": 4.53642628599557, + "grad_norm": 1.0072022676467896, + "learning_rate": 7.056119307733297e-07, + "loss": 0.582, + "step": 12289 + }, + { + "epoch": 4.536795471326606, + "grad_norm": 0.9900904297828674, + "learning_rate": 7.052733040872029e-07, + "loss": 0.5718, + "step": 12290 + }, + { + "epoch": 4.537164656657642, + "grad_norm": 0.9893781542778015, + "learning_rate": 7.049347453309158e-07, + "loss": 0.5817, + "step": 12291 + }, + { + "epoch": 4.537533841988679, + "grad_norm": 0.9451786875724792, + "learning_rate": 7.045962545172813e-07, + "loss": 0.5687, + "step": 12292 + }, + { + "epoch": 4.5379030273197145, + "grad_norm": 0.9564605355262756, + "learning_rate": 7.042578316591125e-07, + "loss": 0.586, + "step": 12293 + }, + { + "epoch": 4.53827221265075, + "grad_norm": 0.9290446043014526, + "learning_rate": 7.039194767692184e-07, + "loss": 0.5768, + "step": 12294 + }, + { + "epoch": 4.538641397981787, + "grad_norm": 0.9965382814407349, + "learning_rate": 7.035811898604042e-07, + "loss": 0.5782, + "step": 12295 + }, + { + "epoch": 4.539010583312823, + "grad_norm": 0.969377338886261, + "learning_rate": 7.032429709454758e-07, + "loss": 0.5776, + "step": 12296 + }, + { + "epoch": 4.539379768643859, + "grad_norm": 0.9813526272773743, + "learning_rate": 7.029048200372341e-07, + "loss": 0.6124, + "step": 12297 + }, + { + "epoch": 4.539748953974895, + "grad_norm": 0.9598655700683594, + "learning_rate": 7.025667371484768e-07, + "loss": 0.5858, + "step": 12298 + }, + { + "epoch": 4.540118139305932, + "grad_norm": 0.9410571455955505, + "learning_rate": 7.022287222920021e-07, + "loss": 0.5532, + "step": 12299 + }, + { + "epoch": 4.540487324636968, + "grad_norm": 1.0005472898483276, + "learning_rate": 7.01890775480602e-07, + "loss": 0.5365, + "step": 12300 + }, + { + "epoch": 4.540856509968004, + "grad_norm": 0.9483343362808228, + "learning_rate": 7.015528967270696e-07, + "loss": 0.5436, + "step": 12301 + }, + { + "epoch": 4.54122569529904, + "grad_norm": 0.9306362271308899, + "learning_rate": 7.012150860441922e-07, + "loss": 0.5514, + "step": 12302 + }, + { + "epoch": 4.541594880630076, + "grad_norm": 0.9423992037773132, + "learning_rate": 7.008773434447552e-07, + "loss": 0.5698, + "step": 12303 + }, + { + "epoch": 4.541964065961112, + "grad_norm": 0.9751172661781311, + "learning_rate": 7.005396689415439e-07, + "loss": 0.5865, + "step": 12304 + }, + { + "epoch": 4.542333251292149, + "grad_norm": 0.9280209541320801, + "learning_rate": 7.002020625473382e-07, + "loss": 0.4948, + "step": 12305 + }, + { + "epoch": 4.542702436623185, + "grad_norm": 0.9184144735336304, + "learning_rate": 6.998645242749158e-07, + "loss": 0.5532, + "step": 12306 + }, + { + "epoch": 4.543071621954221, + "grad_norm": 0.9241465330123901, + "learning_rate": 6.995270541370536e-07, + "loss": 0.5359, + "step": 12307 + }, + { + "epoch": 4.543440807285258, + "grad_norm": 0.9499655365943909, + "learning_rate": 6.991896521465235e-07, + "loss": 0.5677, + "step": 12308 + }, + { + "epoch": 4.543809992616294, + "grad_norm": 0.9380626082420349, + "learning_rate": 6.988523183160981e-07, + "loss": 0.577, + "step": 12309 + }, + { + "epoch": 4.5441791779473295, + "grad_norm": 0.9955887198448181, + "learning_rate": 6.985150526585427e-07, + "loss": 0.5745, + "step": 12310 + }, + { + "epoch": 4.544548363278365, + "grad_norm": 0.9618325233459473, + "learning_rate": 6.981778551866237e-07, + "loss": 0.6094, + "step": 12311 + }, + { + "epoch": 4.544917548609402, + "grad_norm": 1.0025354623794556, + "learning_rate": 6.978407259131051e-07, + "loss": 0.5578, + "step": 12312 + }, + { + "epoch": 4.545286733940438, + "grad_norm": 0.9551016688346863, + "learning_rate": 6.975036648507463e-07, + "loss": 0.5387, + "step": 12313 + }, + { + "epoch": 4.545655919271474, + "grad_norm": 0.9315856099128723, + "learning_rate": 6.971666720123049e-07, + "loss": 0.5423, + "step": 12314 + }, + { + "epoch": 4.54602510460251, + "grad_norm": 0.9382394552230835, + "learning_rate": 6.968297474105353e-07, + "loss": 0.5734, + "step": 12315 + }, + { + "epoch": 4.546394289933547, + "grad_norm": 0.9600075483322144, + "learning_rate": 6.964928910581906e-07, + "loss": 0.6386, + "step": 12316 + }, + { + "epoch": 4.546763475264583, + "grad_norm": 0.9804679155349731, + "learning_rate": 6.961561029680222e-07, + "loss": 0.6143, + "step": 12317 + }, + { + "epoch": 4.547132660595619, + "grad_norm": 0.9955132007598877, + "learning_rate": 6.958193831527745e-07, + "loss": 0.599, + "step": 12318 + }, + { + "epoch": 4.547501845926655, + "grad_norm": 0.9498536586761475, + "learning_rate": 6.954827316251947e-07, + "loss": 0.5875, + "step": 12319 + }, + { + "epoch": 4.547871031257691, + "grad_norm": 0.9578165411949158, + "learning_rate": 6.951461483980231e-07, + "loss": 0.548, + "step": 12320 + }, + { + "epoch": 4.548240216588727, + "grad_norm": 0.9544340372085571, + "learning_rate": 6.94809633484001e-07, + "loss": 0.6099, + "step": 12321 + }, + { + "epoch": 4.548609401919764, + "grad_norm": 1.0189759731292725, + "learning_rate": 6.944731868958643e-07, + "loss": 0.5725, + "step": 12322 + }, + { + "epoch": 4.5489785872508, + "grad_norm": 0.9797841906547546, + "learning_rate": 6.941368086463471e-07, + "loss": 0.5833, + "step": 12323 + }, + { + "epoch": 4.549347772581836, + "grad_norm": 0.9607504606246948, + "learning_rate": 6.938004987481825e-07, + "loss": 0.5408, + "step": 12324 + }, + { + "epoch": 4.549716957912873, + "grad_norm": 0.9557361006736755, + "learning_rate": 6.93464257214099e-07, + "loss": 0.5709, + "step": 12325 + }, + { + "epoch": 4.550086143243909, + "grad_norm": 0.9693350791931152, + "learning_rate": 6.931280840568225e-07, + "loss": 0.5784, + "step": 12326 + }, + { + "epoch": 4.5504553285749445, + "grad_norm": 1.0082348585128784, + "learning_rate": 6.927919792890783e-07, + "loss": 0.5805, + "step": 12327 + }, + { + "epoch": 4.55082451390598, + "grad_norm": 0.9709898233413696, + "learning_rate": 6.924559429235866e-07, + "loss": 0.5616, + "step": 12328 + }, + { + "epoch": 4.551193699237017, + "grad_norm": 0.9528722167015076, + "learning_rate": 6.921199749730686e-07, + "loss": 0.5424, + "step": 12329 + }, + { + "epoch": 4.551562884568053, + "grad_norm": 0.9678505659103394, + "learning_rate": 6.917840754502375e-07, + "loss": 0.568, + "step": 12330 + }, + { + "epoch": 4.551932069899089, + "grad_norm": 0.9594281911849976, + "learning_rate": 6.914482443678083e-07, + "loss": 0.558, + "step": 12331 + }, + { + "epoch": 4.552301255230126, + "grad_norm": 0.9650565385818481, + "learning_rate": 6.911124817384932e-07, + "loss": 0.61, + "step": 12332 + }, + { + "epoch": 4.552670440561162, + "grad_norm": 0.9348960518836975, + "learning_rate": 6.907767875749996e-07, + "loss": 0.528, + "step": 12333 + }, + { + "epoch": 4.553039625892198, + "grad_norm": 0.9459391832351685, + "learning_rate": 6.904411618900339e-07, + "loss": 0.5848, + "step": 12334 + }, + { + "epoch": 4.553408811223234, + "grad_norm": 0.9782647490501404, + "learning_rate": 6.901056046962981e-07, + "loss": 0.5723, + "step": 12335 + }, + { + "epoch": 4.55377799655427, + "grad_norm": 1.1605944633483887, + "learning_rate": 6.897701160064941e-07, + "loss": 0.5798, + "step": 12336 + }, + { + "epoch": 4.554147181885306, + "grad_norm": 1.0015854835510254, + "learning_rate": 6.894346958333212e-07, + "loss": 0.5703, + "step": 12337 + }, + { + "epoch": 4.554516367216342, + "grad_norm": 0.932277500629425, + "learning_rate": 6.890993441894722e-07, + "loss": 0.5698, + "step": 12338 + }, + { + "epoch": 4.554885552547379, + "grad_norm": 0.9608826637268066, + "learning_rate": 6.887640610876422e-07, + "loss": 0.5822, + "step": 12339 + }, + { + "epoch": 4.555254737878415, + "grad_norm": 1.0000303983688354, + "learning_rate": 6.884288465405203e-07, + "loss": 0.5747, + "step": 12340 + }, + { + "epoch": 4.555623923209451, + "grad_norm": 0.9571161866188049, + "learning_rate": 6.880937005607955e-07, + "loss": 0.5745, + "step": 12341 + }, + { + "epoch": 4.555993108540488, + "grad_norm": 0.9771960973739624, + "learning_rate": 6.877586231611522e-07, + "loss": 0.5641, + "step": 12342 + }, + { + "epoch": 4.556362293871524, + "grad_norm": 0.9117264151573181, + "learning_rate": 6.874236143542722e-07, + "loss": 0.5759, + "step": 12343 + }, + { + "epoch": 4.5567314792025595, + "grad_norm": 0.945955216884613, + "learning_rate": 6.870886741528366e-07, + "loss": 0.5734, + "step": 12344 + }, + { + "epoch": 4.557100664533596, + "grad_norm": 0.9928367733955383, + "learning_rate": 6.867538025695239e-07, + "loss": 0.5718, + "step": 12345 + }, + { + "epoch": 4.557469849864632, + "grad_norm": 0.964819073677063, + "learning_rate": 6.864189996170059e-07, + "loss": 0.5422, + "step": 12346 + }, + { + "epoch": 4.557839035195668, + "grad_norm": 0.9640419483184814, + "learning_rate": 6.860842653079574e-07, + "loss": 0.6061, + "step": 12347 + }, + { + "epoch": 4.558208220526704, + "grad_norm": 0.9638115763664246, + "learning_rate": 6.857495996550461e-07, + "loss": 0.5878, + "step": 12348 + }, + { + "epoch": 4.558577405857741, + "grad_norm": 0.9910593032836914, + "learning_rate": 6.854150026709405e-07, + "loss": 0.5908, + "step": 12349 + }, + { + "epoch": 4.558946591188777, + "grad_norm": 0.9506745934486389, + "learning_rate": 6.850804743683046e-07, + "loss": 0.5885, + "step": 12350 + }, + { + "epoch": 4.559315776519813, + "grad_norm": 0.9863948822021484, + "learning_rate": 6.84746014759799e-07, + "loss": 0.5868, + "step": 12351 + }, + { + "epoch": 4.5596849618508495, + "grad_norm": 0.9718477129936218, + "learning_rate": 6.844116238580847e-07, + "loss": 0.5733, + "step": 12352 + }, + { + "epoch": 4.560054147181885, + "grad_norm": 0.9866672158241272, + "learning_rate": 6.840773016758173e-07, + "loss": 0.5542, + "step": 12353 + }, + { + "epoch": 4.560423332512921, + "grad_norm": 0.963489294052124, + "learning_rate": 6.837430482256502e-07, + "loss": 0.5549, + "step": 12354 + }, + { + "epoch": 4.560792517843957, + "grad_norm": 0.9168636202812195, + "learning_rate": 6.834088635202363e-07, + "loss": 0.5547, + "step": 12355 + }, + { + "epoch": 4.561161703174994, + "grad_norm": 0.9354421496391296, + "learning_rate": 6.830747475722227e-07, + "loss": 0.588, + "step": 12356 + }, + { + "epoch": 4.56153088850603, + "grad_norm": 0.9293963313102722, + "learning_rate": 6.827407003942582e-07, + "loss": 0.5402, + "step": 12357 + }, + { + "epoch": 4.561900073837066, + "grad_norm": 0.9681414365768433, + "learning_rate": 6.82406721998983e-07, + "loss": 0.5719, + "step": 12358 + }, + { + "epoch": 4.562269259168103, + "grad_norm": 0.962490439414978, + "learning_rate": 6.8207281239904e-07, + "loss": 0.5334, + "step": 12359 + }, + { + "epoch": 4.562638444499139, + "grad_norm": 0.9489598274230957, + "learning_rate": 6.817389716070677e-07, + "loss": 0.5938, + "step": 12360 + }, + { + "epoch": 4.5630076298301745, + "grad_norm": 0.9539902210235596, + "learning_rate": 6.814051996357016e-07, + "loss": 0.561, + "step": 12361 + }, + { + "epoch": 4.563376815161211, + "grad_norm": 0.9697971343994141, + "learning_rate": 6.810714964975749e-07, + "loss": 0.6124, + "step": 12362 + }, + { + "epoch": 4.563746000492247, + "grad_norm": 0.9713518023490906, + "learning_rate": 6.807378622053171e-07, + "loss": 0.5799, + "step": 12363 + }, + { + "epoch": 4.564115185823283, + "grad_norm": 0.9782620072364807, + "learning_rate": 6.804042967715568e-07, + "loss": 0.5706, + "step": 12364 + }, + { + "epoch": 4.56448437115432, + "grad_norm": 0.9317705035209656, + "learning_rate": 6.800708002089215e-07, + "loss": 0.6186, + "step": 12365 + }, + { + "epoch": 4.564853556485356, + "grad_norm": 0.9622471928596497, + "learning_rate": 6.797373725300305e-07, + "loss": 0.5817, + "step": 12366 + }, + { + "epoch": 4.565222741816392, + "grad_norm": 0.9856782555580139, + "learning_rate": 6.794040137475061e-07, + "loss": 0.5944, + "step": 12367 + }, + { + "epoch": 4.565591927147428, + "grad_norm": 0.9560170769691467, + "learning_rate": 6.790707238739643e-07, + "loss": 0.5952, + "step": 12368 + }, + { + "epoch": 4.5659611124784645, + "grad_norm": 0.9421547055244446, + "learning_rate": 6.78737502922022e-07, + "loss": 0.57, + "step": 12369 + }, + { + "epoch": 4.5663302978095, + "grad_norm": 0.949842631816864, + "learning_rate": 6.784043509042901e-07, + "loss": 0.5884, + "step": 12370 + }, + { + "epoch": 4.566699483140536, + "grad_norm": 0.9928678870201111, + "learning_rate": 6.780712678333778e-07, + "loss": 0.57, + "step": 12371 + }, + { + "epoch": 4.567068668471573, + "grad_norm": 0.9496204853057861, + "learning_rate": 6.77738253721894e-07, + "loss": 0.548, + "step": 12372 + }, + { + "epoch": 4.567437853802609, + "grad_norm": 0.9451133608818054, + "learning_rate": 6.774053085824423e-07, + "loss": 0.5638, + "step": 12373 + }, + { + "epoch": 4.567807039133645, + "grad_norm": 0.9597036838531494, + "learning_rate": 6.770724324276234e-07, + "loss": 0.5607, + "step": 12374 + }, + { + "epoch": 4.568176224464681, + "grad_norm": 0.9770978689193726, + "learning_rate": 6.767396252700384e-07, + "loss": 0.58, + "step": 12375 + }, + { + "epoch": 4.568545409795718, + "grad_norm": 0.9652143716812134, + "learning_rate": 6.764068871222825e-07, + "loss": 0.5388, + "step": 12376 + }, + { + "epoch": 4.568914595126754, + "grad_norm": 0.9582011699676514, + "learning_rate": 6.760742179969518e-07, + "loss": 0.5762, + "step": 12377 + }, + { + "epoch": 4.5692837804577895, + "grad_norm": 0.9663056135177612, + "learning_rate": 6.757416179066348e-07, + "loss": 0.5717, + "step": 12378 + }, + { + "epoch": 4.569652965788826, + "grad_norm": 0.9449358582496643, + "learning_rate": 6.754090868639221e-07, + "loss": 0.5496, + "step": 12379 + }, + { + "epoch": 4.570022151119862, + "grad_norm": 0.9567338228225708, + "learning_rate": 6.750766248814001e-07, + "loss": 0.5824, + "step": 12380 + }, + { + "epoch": 4.570391336450898, + "grad_norm": 0.9782307744026184, + "learning_rate": 6.747442319716518e-07, + "loss": 0.5704, + "step": 12381 + }, + { + "epoch": 4.570760521781935, + "grad_norm": 0.9588723182678223, + "learning_rate": 6.744119081472578e-07, + "loss": 0.5989, + "step": 12382 + }, + { + "epoch": 4.571129707112971, + "grad_norm": 0.9455458521842957, + "learning_rate": 6.740796534207974e-07, + "loss": 0.6032, + "step": 12383 + }, + { + "epoch": 4.571498892444007, + "grad_norm": 0.975827693939209, + "learning_rate": 6.737474678048453e-07, + "loss": 0.5746, + "step": 12384 + }, + { + "epoch": 4.571868077775044, + "grad_norm": 0.9575393199920654, + "learning_rate": 6.734153513119765e-07, + "loss": 0.5599, + "step": 12385 + }, + { + "epoch": 4.5722372631060795, + "grad_norm": 0.9516708850860596, + "learning_rate": 6.730833039547588e-07, + "loss": 0.5654, + "step": 12386 + }, + { + "epoch": 4.572606448437115, + "grad_norm": 0.9473376274108887, + "learning_rate": 6.727513257457613e-07, + "loss": 0.5723, + "step": 12387 + }, + { + "epoch": 4.572975633768151, + "grad_norm": 0.9485699534416199, + "learning_rate": 6.724194166975503e-07, + "loss": 0.5605, + "step": 12388 + }, + { + "epoch": 4.573344819099188, + "grad_norm": 0.9577664732933044, + "learning_rate": 6.720875768226876e-07, + "loss": 0.5599, + "step": 12389 + }, + { + "epoch": 4.573714004430224, + "grad_norm": 0.9736403226852417, + "learning_rate": 6.717558061337331e-07, + "loss": 0.5571, + "step": 12390 + }, + { + "epoch": 4.57408318976126, + "grad_norm": 0.9421654343605042, + "learning_rate": 6.714241046432437e-07, + "loss": 0.5685, + "step": 12391 + }, + { + "epoch": 4.574452375092296, + "grad_norm": 0.9212653040885925, + "learning_rate": 6.710924723637757e-07, + "loss": 0.5703, + "step": 12392 + }, + { + "epoch": 4.574821560423333, + "grad_norm": 1.0326344966888428, + "learning_rate": 6.707609093078805e-07, + "loss": 0.5507, + "step": 12393 + }, + { + "epoch": 4.575190745754369, + "grad_norm": 0.9100222587585449, + "learning_rate": 6.704294154881064e-07, + "loss": 0.5771, + "step": 12394 + }, + { + "epoch": 4.5755599310854045, + "grad_norm": 0.9653185606002808, + "learning_rate": 6.700979909170025e-07, + "loss": 0.5677, + "step": 12395 + }, + { + "epoch": 4.575929116416441, + "grad_norm": 0.977069079875946, + "learning_rate": 6.697666356071111e-07, + "loss": 0.5692, + "step": 12396 + }, + { + "epoch": 4.576298301747477, + "grad_norm": 0.9844152927398682, + "learning_rate": 6.69435349570976e-07, + "loss": 0.5934, + "step": 12397 + }, + { + "epoch": 4.576667487078513, + "grad_norm": 0.9594762921333313, + "learning_rate": 6.691041328211351e-07, + "loss": 0.5903, + "step": 12398 + }, + { + "epoch": 4.57703667240955, + "grad_norm": 0.9830948114395142, + "learning_rate": 6.687729853701242e-07, + "loss": 0.527, + "step": 12399 + }, + { + "epoch": 4.577405857740586, + "grad_norm": 0.9504079222679138, + "learning_rate": 6.684419072304785e-07, + "loss": 0.5995, + "step": 12400 + }, + { + "epoch": 4.577775043071622, + "grad_norm": 0.9492847919464111, + "learning_rate": 6.681108984147286e-07, + "loss": 0.557, + "step": 12401 + }, + { + "epoch": 4.5781442284026586, + "grad_norm": 0.9570289850234985, + "learning_rate": 6.677799589354022e-07, + "loss": 0.5956, + "step": 12402 + }, + { + "epoch": 4.5785134137336945, + "grad_norm": 0.9705991148948669, + "learning_rate": 6.674490888050269e-07, + "loss": 0.5922, + "step": 12403 + }, + { + "epoch": 4.57888259906473, + "grad_norm": 0.9696580767631531, + "learning_rate": 6.671182880361246e-07, + "loss": 0.5552, + "step": 12404 + }, + { + "epoch": 4.579251784395766, + "grad_norm": 0.9453771114349365, + "learning_rate": 6.667875566412182e-07, + "loss": 0.6027, + "step": 12405 + }, + { + "epoch": 4.579620969726803, + "grad_norm": 0.9563854932785034, + "learning_rate": 6.664568946328226e-07, + "loss": 0.603, + "step": 12406 + }, + { + "epoch": 4.579990155057839, + "grad_norm": 0.9659749269485474, + "learning_rate": 6.661263020234548e-07, + "loss": 0.5601, + "step": 12407 + }, + { + "epoch": 4.580359340388875, + "grad_norm": 0.9403030872344971, + "learning_rate": 6.657957788256283e-07, + "loss": 0.5147, + "step": 12408 + }, + { + "epoch": 4.580728525719912, + "grad_norm": 0.9392856955528259, + "learning_rate": 6.65465325051853e-07, + "loss": 0.5478, + "step": 12409 + }, + { + "epoch": 4.581097711050948, + "grad_norm": 0.9587939977645874, + "learning_rate": 6.65134940714636e-07, + "loss": 0.5569, + "step": 12410 + }, + { + "epoch": 4.581466896381984, + "grad_norm": 0.9734517335891724, + "learning_rate": 6.648046258264817e-07, + "loss": 0.547, + "step": 12411 + }, + { + "epoch": 4.5818360817130195, + "grad_norm": 0.9302161931991577, + "learning_rate": 6.644743803998938e-07, + "loss": 0.5887, + "step": 12412 + }, + { + "epoch": 4.582205267044056, + "grad_norm": 0.9679258465766907, + "learning_rate": 6.641442044473714e-07, + "loss": 0.5724, + "step": 12413 + }, + { + "epoch": 4.582574452375092, + "grad_norm": 1.0331815481185913, + "learning_rate": 6.638140979814106e-07, + "loss": 0.5713, + "step": 12414 + }, + { + "epoch": 4.582943637706128, + "grad_norm": 0.9312252402305603, + "learning_rate": 6.634840610145069e-07, + "loss": 0.5415, + "step": 12415 + }, + { + "epoch": 4.583312823037165, + "grad_norm": 0.9239019751548767, + "learning_rate": 6.631540935591524e-07, + "loss": 0.5789, + "step": 12416 + }, + { + "epoch": 4.583682008368201, + "grad_norm": 0.9623661041259766, + "learning_rate": 6.628241956278359e-07, + "loss": 0.5584, + "step": 12417 + }, + { + "epoch": 4.584051193699237, + "grad_norm": 0.9674127101898193, + "learning_rate": 6.624943672330437e-07, + "loss": 0.5957, + "step": 12418 + }, + { + "epoch": 4.5844203790302736, + "grad_norm": 0.9818588495254517, + "learning_rate": 6.621646083872587e-07, + "loss": 0.5519, + "step": 12419 + }, + { + "epoch": 4.5847895643613095, + "grad_norm": 0.9589970707893372, + "learning_rate": 6.618349191029644e-07, + "loss": 0.5654, + "step": 12420 + }, + { + "epoch": 4.585158749692345, + "grad_norm": 0.9452071189880371, + "learning_rate": 6.615052993926382e-07, + "loss": 0.6172, + "step": 12421 + }, + { + "epoch": 4.585527935023382, + "grad_norm": 0.9448072910308838, + "learning_rate": 6.611757492687554e-07, + "loss": 0.5552, + "step": 12422 + }, + { + "epoch": 4.585897120354418, + "grad_norm": 0.9618515968322754, + "learning_rate": 6.608462687437909e-07, + "loss": 0.6094, + "step": 12423 + }, + { + "epoch": 4.586266305685454, + "grad_norm": 0.9721788167953491, + "learning_rate": 6.605168578302141e-07, + "loss": 0.5924, + "step": 12424 + }, + { + "epoch": 4.58663549101649, + "grad_norm": 0.9499609470367432, + "learning_rate": 6.601875165404941e-07, + "loss": 0.5944, + "step": 12425 + }, + { + "epoch": 4.587004676347527, + "grad_norm": 0.9685709476470947, + "learning_rate": 6.598582448870961e-07, + "loss": 0.5908, + "step": 12426 + }, + { + "epoch": 4.587373861678563, + "grad_norm": 0.9713255167007446, + "learning_rate": 6.595290428824821e-07, + "loss": 0.5467, + "step": 12427 + }, + { + "epoch": 4.587743047009599, + "grad_norm": 0.9583146572113037, + "learning_rate": 6.591999105391137e-07, + "loss": 0.5698, + "step": 12428 + }, + { + "epoch": 4.588112232340635, + "grad_norm": 0.9462921023368835, + "learning_rate": 6.588708478694475e-07, + "loss": 0.5685, + "step": 12429 + }, + { + "epoch": 4.588481417671671, + "grad_norm": 0.9608304500579834, + "learning_rate": 6.585418548859381e-07, + "loss": 0.5824, + "step": 12430 + }, + { + "epoch": 4.588850603002707, + "grad_norm": 0.9687280058860779, + "learning_rate": 6.582129316010391e-07, + "loss": 0.5606, + "step": 12431 + }, + { + "epoch": 4.589219788333743, + "grad_norm": 0.9619055986404419, + "learning_rate": 6.578840780271984e-07, + "loss": 0.5421, + "step": 12432 + }, + { + "epoch": 4.58958897366478, + "grad_norm": 0.965300440788269, + "learning_rate": 6.575552941768657e-07, + "loss": 0.593, + "step": 12433 + }, + { + "epoch": 4.589958158995816, + "grad_norm": 0.9191052913665771, + "learning_rate": 6.572265800624819e-07, + "loss": 0.5936, + "step": 12434 + }, + { + "epoch": 4.590327344326852, + "grad_norm": 0.97463059425354, + "learning_rate": 6.568979356964908e-07, + "loss": 0.5671, + "step": 12435 + }, + { + "epoch": 4.5906965296578885, + "grad_norm": 0.9815130829811096, + "learning_rate": 6.565693610913316e-07, + "loss": 0.5645, + "step": 12436 + }, + { + "epoch": 4.5910657149889245, + "grad_norm": 0.9571602940559387, + "learning_rate": 6.562408562594402e-07, + "loss": 0.5762, + "step": 12437 + }, + { + "epoch": 4.59143490031996, + "grad_norm": 0.9439051151275635, + "learning_rate": 6.559124212132509e-07, + "loss": 0.5743, + "step": 12438 + }, + { + "epoch": 4.591804085650997, + "grad_norm": 0.977609395980835, + "learning_rate": 6.555840559651933e-07, + "loss": 0.5967, + "step": 12439 + }, + { + "epoch": 4.592173270982033, + "grad_norm": 0.9778510928153992, + "learning_rate": 6.55255760527698e-07, + "loss": 0.5776, + "step": 12440 + }, + { + "epoch": 4.592542456313069, + "grad_norm": 0.9654958844184875, + "learning_rate": 6.5492753491319e-07, + "loss": 0.5686, + "step": 12441 + }, + { + "epoch": 4.592911641644106, + "grad_norm": 0.9555073380470276, + "learning_rate": 6.545993791340918e-07, + "loss": 0.5615, + "step": 12442 + }, + { + "epoch": 4.593280826975142, + "grad_norm": 0.963968813419342, + "learning_rate": 6.542712932028253e-07, + "loss": 0.5731, + "step": 12443 + }, + { + "epoch": 4.593650012306178, + "grad_norm": 0.9856855869293213, + "learning_rate": 6.539432771318069e-07, + "loss": 0.6226, + "step": 12444 + }, + { + "epoch": 4.594019197637214, + "grad_norm": 0.9474522471427917, + "learning_rate": 6.536153309334539e-07, + "loss": 0.5775, + "step": 12445 + }, + { + "epoch": 4.59438838296825, + "grad_norm": 0.9689680337905884, + "learning_rate": 6.532874546201778e-07, + "loss": 0.5758, + "step": 12446 + }, + { + "epoch": 4.594757568299286, + "grad_norm": 0.9744295477867126, + "learning_rate": 6.529596482043879e-07, + "loss": 0.5727, + "step": 12447 + }, + { + "epoch": 4.595126753630322, + "grad_norm": 0.9641205072402954, + "learning_rate": 6.526319116984934e-07, + "loss": 0.5841, + "step": 12448 + }, + { + "epoch": 4.595495938961358, + "grad_norm": 0.9801648855209351, + "learning_rate": 6.523042451148975e-07, + "loss": 0.5997, + "step": 12449 + }, + { + "epoch": 4.595865124292395, + "grad_norm": 0.998727560043335, + "learning_rate": 6.519766484660023e-07, + "loss": 0.579, + "step": 12450 + }, + { + "epoch": 4.596234309623431, + "grad_norm": 0.9475196003913879, + "learning_rate": 6.516491217642085e-07, + "loss": 0.5937, + "step": 12451 + }, + { + "epoch": 4.596603494954467, + "grad_norm": 0.994011402130127, + "learning_rate": 6.513216650219115e-07, + "loss": 0.5877, + "step": 12452 + }, + { + "epoch": 4.5969726802855035, + "grad_norm": 0.9543420672416687, + "learning_rate": 6.509942782515063e-07, + "loss": 0.5945, + "step": 12453 + }, + { + "epoch": 4.5973418656165395, + "grad_norm": 0.9882544279098511, + "learning_rate": 6.506669614653847e-07, + "loss": 0.5656, + "step": 12454 + }, + { + "epoch": 4.597711050947575, + "grad_norm": 0.9748114347457886, + "learning_rate": 6.503397146759338e-07, + "loss": 0.5402, + "step": 12455 + }, + { + "epoch": 4.598080236278612, + "grad_norm": 0.9689405560493469, + "learning_rate": 6.500125378955419e-07, + "loss": 0.5581, + "step": 12456 + }, + { + "epoch": 4.598449421609648, + "grad_norm": 0.9386009573936462, + "learning_rate": 6.496854311365914e-07, + "loss": 0.5355, + "step": 12457 + }, + { + "epoch": 4.598818606940684, + "grad_norm": 0.9512738585472107, + "learning_rate": 6.493583944114626e-07, + "loss": 0.5823, + "step": 12458 + }, + { + "epoch": 4.599187792271721, + "grad_norm": 0.9419070482254028, + "learning_rate": 6.490314277325355e-07, + "loss": 0.5835, + "step": 12459 + }, + { + "epoch": 4.599556977602757, + "grad_norm": 0.961972713470459, + "learning_rate": 6.487045311121843e-07, + "loss": 0.6062, + "step": 12460 + }, + { + "epoch": 4.599926162933793, + "grad_norm": 0.9812614321708679, + "learning_rate": 6.483777045627826e-07, + "loss": 0.5748, + "step": 12461 + }, + { + "epoch": 4.600295348264829, + "grad_norm": 0.9465078711509705, + "learning_rate": 6.480509480966995e-07, + "loss": 0.5775, + "step": 12462 + }, + { + "epoch": 4.600664533595865, + "grad_norm": 0.9573729634284973, + "learning_rate": 6.477242617263038e-07, + "loss": 0.5713, + "step": 12463 + }, + { + "epoch": 4.601033718926901, + "grad_norm": 0.9580425024032593, + "learning_rate": 6.473976454639608e-07, + "loss": 0.5833, + "step": 12464 + }, + { + "epoch": 4.601402904257937, + "grad_norm": 0.9482818245887756, + "learning_rate": 6.470710993220322e-07, + "loss": 0.5818, + "step": 12465 + }, + { + "epoch": 4.601772089588974, + "grad_norm": 0.9637466669082642, + "learning_rate": 6.467446233128776e-07, + "loss": 0.6071, + "step": 12466 + }, + { + "epoch": 4.60214127492001, + "grad_norm": 0.9218672513961792, + "learning_rate": 6.464182174488534e-07, + "loss": 0.5936, + "step": 12467 + }, + { + "epoch": 4.602510460251046, + "grad_norm": 0.9773741960525513, + "learning_rate": 6.460918817423157e-07, + "loss": 0.5693, + "step": 12468 + }, + { + "epoch": 4.602879645582082, + "grad_norm": 0.9466784000396729, + "learning_rate": 6.457656162056147e-07, + "loss": 0.594, + "step": 12469 + }, + { + "epoch": 4.6032488309131185, + "grad_norm": 0.9820647239685059, + "learning_rate": 6.454394208510995e-07, + "loss": 0.5674, + "step": 12470 + }, + { + "epoch": 4.6036180162441545, + "grad_norm": 0.9481598138809204, + "learning_rate": 6.451132956911177e-07, + "loss": 0.5893, + "step": 12471 + }, + { + "epoch": 4.60398720157519, + "grad_norm": 0.9694433212280273, + "learning_rate": 6.447872407380113e-07, + "loss": 0.5594, + "step": 12472 + }, + { + "epoch": 4.604356386906227, + "grad_norm": 0.9639301300048828, + "learning_rate": 6.444612560041232e-07, + "loss": 0.5872, + "step": 12473 + }, + { + "epoch": 4.604725572237263, + "grad_norm": 0.981911301612854, + "learning_rate": 6.441353415017909e-07, + "loss": 0.5946, + "step": 12474 + }, + { + "epoch": 4.605094757568299, + "grad_norm": 0.9921848773956299, + "learning_rate": 6.438094972433495e-07, + "loss": 0.5602, + "step": 12475 + }, + { + "epoch": 4.605463942899336, + "grad_norm": 0.9646561741828918, + "learning_rate": 6.434837232411334e-07, + "loss": 0.5606, + "step": 12476 + }, + { + "epoch": 4.605833128230372, + "grad_norm": 0.9544456005096436, + "learning_rate": 6.431580195074727e-07, + "loss": 0.601, + "step": 12477 + }, + { + "epoch": 4.606202313561408, + "grad_norm": 0.9567283987998962, + "learning_rate": 6.428323860546939e-07, + "loss": 0.5921, + "step": 12478 + }, + { + "epoch": 4.606571498892444, + "grad_norm": 0.9894390106201172, + "learning_rate": 6.425068228951243e-07, + "loss": 0.5717, + "step": 12479 + }, + { + "epoch": 4.60694068422348, + "grad_norm": 0.9647639393806458, + "learning_rate": 6.421813300410848e-07, + "loss": 0.5614, + "step": 12480 + }, + { + "epoch": 4.607309869554516, + "grad_norm": 0.9519038796424866, + "learning_rate": 6.418559075048958e-07, + "loss": 0.5685, + "step": 12481 + }, + { + "epoch": 4.607679054885552, + "grad_norm": 0.9845471978187561, + "learning_rate": 6.415305552988735e-07, + "loss": 0.6106, + "step": 12482 + }, + { + "epoch": 4.608048240216589, + "grad_norm": 0.9749199151992798, + "learning_rate": 6.412052734353333e-07, + "loss": 0.5976, + "step": 12483 + }, + { + "epoch": 4.608417425547625, + "grad_norm": 0.9391535520553589, + "learning_rate": 6.408800619265873e-07, + "loss": 0.5322, + "step": 12484 + }, + { + "epoch": 4.608786610878661, + "grad_norm": 0.9316312074661255, + "learning_rate": 6.405549207849443e-07, + "loss": 0.5667, + "step": 12485 + }, + { + "epoch": 4.609155796209698, + "grad_norm": 0.9582907557487488, + "learning_rate": 6.402298500227109e-07, + "loss": 0.5775, + "step": 12486 + }, + { + "epoch": 4.6095249815407335, + "grad_norm": 0.9483131766319275, + "learning_rate": 6.399048496521898e-07, + "loss": 0.592, + "step": 12487 + }, + { + "epoch": 4.6098941668717694, + "grad_norm": 0.9555020928382874, + "learning_rate": 6.395799196856838e-07, + "loss": 0.5717, + "step": 12488 + }, + { + "epoch": 4.610263352202805, + "grad_norm": 0.961122989654541, + "learning_rate": 6.392550601354908e-07, + "loss": 0.5806, + "step": 12489 + }, + { + "epoch": 4.610632537533842, + "grad_norm": 0.9683997631072998, + "learning_rate": 6.389302710139056e-07, + "loss": 0.5739, + "step": 12490 + }, + { + "epoch": 4.611001722864878, + "grad_norm": 0.9432030916213989, + "learning_rate": 6.386055523332224e-07, + "loss": 0.5504, + "step": 12491 + }, + { + "epoch": 4.611370908195914, + "grad_norm": 0.9166076183319092, + "learning_rate": 6.382809041057325e-07, + "loss": 0.5571, + "step": 12492 + }, + { + "epoch": 4.611740093526951, + "grad_norm": 0.9705665111541748, + "learning_rate": 6.379563263437224e-07, + "loss": 0.6069, + "step": 12493 + }, + { + "epoch": 4.612109278857987, + "grad_norm": 0.933463990688324, + "learning_rate": 6.376318190594783e-07, + "loss": 0.5613, + "step": 12494 + }, + { + "epoch": 4.612478464189023, + "grad_norm": 0.9493824243545532, + "learning_rate": 6.373073822652809e-07, + "loss": 0.548, + "step": 12495 + }, + { + "epoch": 4.612847649520059, + "grad_norm": 0.9454423189163208, + "learning_rate": 6.36983015973412e-07, + "loss": 0.5827, + "step": 12496 + }, + { + "epoch": 4.613216834851095, + "grad_norm": 0.9706950187683105, + "learning_rate": 6.366587201961477e-07, + "loss": 0.5682, + "step": 12497 + }, + { + "epoch": 4.613586020182131, + "grad_norm": 0.9378330707550049, + "learning_rate": 6.363344949457623e-07, + "loss": 0.5415, + "step": 12498 + }, + { + "epoch": 4.613955205513168, + "grad_norm": 0.953632116317749, + "learning_rate": 6.360103402345289e-07, + "loss": 0.5993, + "step": 12499 + }, + { + "epoch": 4.614324390844204, + "grad_norm": 0.9610472917556763, + "learning_rate": 6.356862560747156e-07, + "loss": 0.5362, + "step": 12500 + }, + { + "epoch": 4.61469357617524, + "grad_norm": 0.9698344469070435, + "learning_rate": 6.353622424785888e-07, + "loss": 0.5227, + "step": 12501 + }, + { + "epoch": 4.615062761506276, + "grad_norm": 0.9476434588432312, + "learning_rate": 6.350382994584131e-07, + "loss": 0.5543, + "step": 12502 + }, + { + "epoch": 4.615431946837313, + "grad_norm": 0.9484695196151733, + "learning_rate": 6.347144270264482e-07, + "loss": 0.5677, + "step": 12503 + }, + { + "epoch": 4.6158011321683485, + "grad_norm": 0.9500684142112732, + "learning_rate": 6.343906251949547e-07, + "loss": 0.5731, + "step": 12504 + }, + { + "epoch": 4.6161703174993844, + "grad_norm": 0.9640185832977295, + "learning_rate": 6.340668939761871e-07, + "loss": 0.5469, + "step": 12505 + }, + { + "epoch": 4.616539502830421, + "grad_norm": 0.974351167678833, + "learning_rate": 6.337432333823979e-07, + "loss": 0.5735, + "step": 12506 + }, + { + "epoch": 4.616908688161457, + "grad_norm": 0.993827760219574, + "learning_rate": 6.334196434258389e-07, + "loss": 0.6111, + "step": 12507 + }, + { + "epoch": 4.617277873492493, + "grad_norm": 0.9626508951187134, + "learning_rate": 6.330961241187575e-07, + "loss": 0.5917, + "step": 12508 + }, + { + "epoch": 4.617647058823529, + "grad_norm": 0.9223693609237671, + "learning_rate": 6.327726754733984e-07, + "loss": 0.5444, + "step": 12509 + }, + { + "epoch": 4.618016244154566, + "grad_norm": 0.9509323835372925, + "learning_rate": 6.324492975020039e-07, + "loss": 0.5485, + "step": 12510 + }, + { + "epoch": 4.618385429485602, + "grad_norm": 0.9273017644882202, + "learning_rate": 6.321259902168136e-07, + "loss": 0.5569, + "step": 12511 + }, + { + "epoch": 4.618754614816638, + "grad_norm": 0.9645373225212097, + "learning_rate": 6.318027536300661e-07, + "loss": 0.5911, + "step": 12512 + }, + { + "epoch": 4.619123800147674, + "grad_norm": 0.9732146263122559, + "learning_rate": 6.314795877539947e-07, + "loss": 0.5696, + "step": 12513 + }, + { + "epoch": 4.61949298547871, + "grad_norm": 0.9607918858528137, + "learning_rate": 6.311564926008312e-07, + "loss": 0.5635, + "step": 12514 + }, + { + "epoch": 4.619862170809746, + "grad_norm": 0.9490680694580078, + "learning_rate": 6.308334681828038e-07, + "loss": 0.5716, + "step": 12515 + }, + { + "epoch": 4.620231356140783, + "grad_norm": 0.9729613661766052, + "learning_rate": 6.305105145121404e-07, + "loss": 0.575, + "step": 12516 + }, + { + "epoch": 4.620600541471819, + "grad_norm": 0.9609436392784119, + "learning_rate": 6.301876316010642e-07, + "loss": 0.5465, + "step": 12517 + }, + { + "epoch": 4.620969726802855, + "grad_norm": 0.9528532028198242, + "learning_rate": 6.298648194617951e-07, + "loss": 0.6027, + "step": 12518 + }, + { + "epoch": 4.621338912133892, + "grad_norm": 0.9786722660064697, + "learning_rate": 6.295420781065534e-07, + "loss": 0.561, + "step": 12519 + }, + { + "epoch": 4.621708097464928, + "grad_norm": 0.9513490796089172, + "learning_rate": 6.292194075475536e-07, + "loss": 0.5661, + "step": 12520 + }, + { + "epoch": 4.6220772827959635, + "grad_norm": 0.9634760022163391, + "learning_rate": 6.288968077970078e-07, + "loss": 0.5544, + "step": 12521 + }, + { + "epoch": 4.622446468126999, + "grad_norm": 0.9723106026649475, + "learning_rate": 6.285742788671281e-07, + "loss": 0.5632, + "step": 12522 + }, + { + "epoch": 4.622815653458036, + "grad_norm": 0.9633849263191223, + "learning_rate": 6.282518207701208e-07, + "loss": 0.5448, + "step": 12523 + }, + { + "epoch": 4.623184838789072, + "grad_norm": 0.9712866544723511, + "learning_rate": 6.279294335181921e-07, + "loss": 0.5869, + "step": 12524 + }, + { + "epoch": 4.623554024120108, + "grad_norm": 0.9628087282180786, + "learning_rate": 6.276071171235432e-07, + "loss": 0.588, + "step": 12525 + }, + { + "epoch": 4.623923209451144, + "grad_norm": 0.9872446060180664, + "learning_rate": 6.272848715983734e-07, + "loss": 0.575, + "step": 12526 + }, + { + "epoch": 4.624292394782181, + "grad_norm": 0.9293995499610901, + "learning_rate": 6.269626969548812e-07, + "loss": 0.5536, + "step": 12527 + }, + { + "epoch": 4.624661580113217, + "grad_norm": 0.9402612447738647, + "learning_rate": 6.266405932052594e-07, + "loss": 0.5642, + "step": 12528 + }, + { + "epoch": 4.625030765444253, + "grad_norm": 0.9546994566917419, + "learning_rate": 6.263185603616992e-07, + "loss": 0.5639, + "step": 12529 + }, + { + "epoch": 4.625399950775289, + "grad_norm": 0.9608020186424255, + "learning_rate": 6.259965984363911e-07, + "loss": 0.5802, + "step": 12530 + }, + { + "epoch": 4.625769136106325, + "grad_norm": 0.9868547916412354, + "learning_rate": 6.256747074415193e-07, + "loss": 0.5804, + "step": 12531 + }, + { + "epoch": 4.626138321437361, + "grad_norm": 0.9682538509368896, + "learning_rate": 6.253528873892692e-07, + "loss": 0.6028, + "step": 12532 + }, + { + "epoch": 4.626507506768398, + "grad_norm": 0.9609774947166443, + "learning_rate": 6.250311382918206e-07, + "loss": 0.5991, + "step": 12533 + }, + { + "epoch": 4.626876692099434, + "grad_norm": 0.9598527550697327, + "learning_rate": 6.24709460161351e-07, + "loss": 0.5852, + "step": 12534 + }, + { + "epoch": 4.62724587743047, + "grad_norm": 0.9492047429084778, + "learning_rate": 6.243878530100372e-07, + "loss": 0.5892, + "step": 12535 + }, + { + "epoch": 4.627615062761507, + "grad_norm": 0.9615417718887329, + "learning_rate": 6.24066316850051e-07, + "loss": 0.5494, + "step": 12536 + }, + { + "epoch": 4.627984248092543, + "grad_norm": 0.9583923816680908, + "learning_rate": 6.237448516935629e-07, + "loss": 0.6, + "step": 12537 + }, + { + "epoch": 4.6283534334235785, + "grad_norm": 0.9814584851264954, + "learning_rate": 6.234234575527393e-07, + "loss": 0.5926, + "step": 12538 + }, + { + "epoch": 4.628722618754614, + "grad_norm": 0.9488547444343567, + "learning_rate": 6.231021344397453e-07, + "loss": 0.5715, + "step": 12539 + }, + { + "epoch": 4.629091804085651, + "grad_norm": 0.9725069999694824, + "learning_rate": 6.22780882366745e-07, + "loss": 0.6117, + "step": 12540 + }, + { + "epoch": 4.629460989416687, + "grad_norm": 0.9750696420669556, + "learning_rate": 6.224597013458942e-07, + "loss": 0.591, + "step": 12541 + }, + { + "epoch": 4.629830174747723, + "grad_norm": 0.9560492038726807, + "learning_rate": 6.221385913893519e-07, + "loss": 0.5622, + "step": 12542 + }, + { + "epoch": 4.63019936007876, + "grad_norm": 0.9384058713912964, + "learning_rate": 6.218175525092704e-07, + "loss": 0.5978, + "step": 12543 + }, + { + "epoch": 4.630568545409796, + "grad_norm": 0.9941636919975281, + "learning_rate": 6.214965847178025e-07, + "loss": 0.5323, + "step": 12544 + }, + { + "epoch": 4.630937730740832, + "grad_norm": 0.9375125765800476, + "learning_rate": 6.211756880270961e-07, + "loss": 0.5831, + "step": 12545 + }, + { + "epoch": 4.631306916071868, + "grad_norm": 0.9532259702682495, + "learning_rate": 6.20854862449296e-07, + "loss": 0.5647, + "step": 12546 + }, + { + "epoch": 4.631676101402904, + "grad_norm": 0.9610616564750671, + "learning_rate": 6.205341079965474e-07, + "loss": 0.5978, + "step": 12547 + }, + { + "epoch": 4.63204528673394, + "grad_norm": 0.9263715744018555, + "learning_rate": 6.202134246809893e-07, + "loss": 0.5555, + "step": 12548 + }, + { + "epoch": 4.632414472064976, + "grad_norm": 0.953551709651947, + "learning_rate": 6.198928125147593e-07, + "loss": 0.5215, + "step": 12549 + }, + { + "epoch": 4.632783657396013, + "grad_norm": 0.9506643414497375, + "learning_rate": 6.195722715099936e-07, + "loss": 0.5731, + "step": 12550 + }, + { + "epoch": 4.633152842727049, + "grad_norm": 0.9726430177688599, + "learning_rate": 6.192518016788234e-07, + "loss": 0.5587, + "step": 12551 + }, + { + "epoch": 4.633522028058085, + "grad_norm": 0.9464359879493713, + "learning_rate": 6.189314030333796e-07, + "loss": 0.5277, + "step": 12552 + }, + { + "epoch": 4.633891213389122, + "grad_norm": 0.9502270817756653, + "learning_rate": 6.186110755857882e-07, + "loss": 0.5395, + "step": 12553 + }, + { + "epoch": 4.634260398720158, + "grad_norm": 0.9413378238677979, + "learning_rate": 6.182908193481734e-07, + "loss": 0.59, + "step": 12554 + }, + { + "epoch": 4.6346295840511935, + "grad_norm": 0.9718813300132751, + "learning_rate": 6.17970634332658e-07, + "loss": 0.6169, + "step": 12555 + }, + { + "epoch": 4.63499876938223, + "grad_norm": 0.969231367111206, + "learning_rate": 6.1765052055136e-07, + "loss": 0.5856, + "step": 12556 + }, + { + "epoch": 4.635367954713266, + "grad_norm": 0.9942222833633423, + "learning_rate": 6.173304780163955e-07, + "loss": 0.5454, + "step": 12557 + }, + { + "epoch": 4.635737140044302, + "grad_norm": 0.9196038246154785, + "learning_rate": 6.17010506739878e-07, + "loss": 0.5491, + "step": 12558 + }, + { + "epoch": 4.636106325375338, + "grad_norm": 0.9674280285835266, + "learning_rate": 6.166906067339181e-07, + "loss": 0.5973, + "step": 12559 + }, + { + "epoch": 4.636475510706375, + "grad_norm": 0.9758324027061462, + "learning_rate": 6.163707780106259e-07, + "loss": 0.5799, + "step": 12560 + }, + { + "epoch": 4.636844696037411, + "grad_norm": 0.998699426651001, + "learning_rate": 6.160510205821038e-07, + "loss": 0.5808, + "step": 12561 + }, + { + "epoch": 4.637213881368447, + "grad_norm": 0.9349461793899536, + "learning_rate": 6.157313344604568e-07, + "loss": 0.5513, + "step": 12562 + }, + { + "epoch": 4.6375830666994835, + "grad_norm": 0.9539383053779602, + "learning_rate": 6.154117196577833e-07, + "loss": 0.5781, + "step": 12563 + }, + { + "epoch": 4.637952252030519, + "grad_norm": 0.9549605250358582, + "learning_rate": 6.150921761861822e-07, + "loss": 0.5447, + "step": 12564 + }, + { + "epoch": 4.638321437361555, + "grad_norm": 0.9102135896682739, + "learning_rate": 6.147727040577475e-07, + "loss": 0.5546, + "step": 12565 + }, + { + "epoch": 4.638690622692591, + "grad_norm": 0.9580401182174683, + "learning_rate": 6.1445330328457e-07, + "loss": 0.5877, + "step": 12566 + }, + { + "epoch": 4.639059808023628, + "grad_norm": 0.9529427886009216, + "learning_rate": 6.1413397387874e-07, + "loss": 0.6009, + "step": 12567 + }, + { + "epoch": 4.639428993354664, + "grad_norm": 0.9486605525016785, + "learning_rate": 6.138147158523455e-07, + "loss": 0.5828, + "step": 12568 + }, + { + "epoch": 4.6397981786857, + "grad_norm": 0.9372040629386902, + "learning_rate": 6.134955292174671e-07, + "loss": 0.5479, + "step": 12569 + }, + { + "epoch": 4.640167364016737, + "grad_norm": 0.959546685218811, + "learning_rate": 6.131764139861882e-07, + "loss": 0.5174, + "step": 12570 + }, + { + "epoch": 4.640536549347773, + "grad_norm": 0.9501592516899109, + "learning_rate": 6.128573701705859e-07, + "loss": 0.5696, + "step": 12571 + }, + { + "epoch": 4.6409057346788085, + "grad_norm": 0.9671998023986816, + "learning_rate": 6.12538397782737e-07, + "loss": 0.5932, + "step": 12572 + }, + { + "epoch": 4.641274920009845, + "grad_norm": 0.9533870220184326, + "learning_rate": 6.122194968347145e-07, + "loss": 0.5763, + "step": 12573 + }, + { + "epoch": 4.641644105340881, + "grad_norm": 0.9709552526473999, + "learning_rate": 6.119006673385871e-07, + "loss": 0.5744, + "step": 12574 + }, + { + "epoch": 4.642013290671917, + "grad_norm": 0.9277927279472351, + "learning_rate": 6.115819093064246e-07, + "loss": 0.5804, + "step": 12575 + }, + { + "epoch": 4.642382476002954, + "grad_norm": 0.9605311155319214, + "learning_rate": 6.112632227502907e-07, + "loss": 0.552, + "step": 12576 + }, + { + "epoch": 4.64275166133399, + "grad_norm": 0.9556404948234558, + "learning_rate": 6.10944607682247e-07, + "loss": 0.5489, + "step": 12577 + }, + { + "epoch": 4.643120846665026, + "grad_norm": 0.9761450290679932, + "learning_rate": 6.106260641143547e-07, + "loss": 0.5975, + "step": 12578 + }, + { + "epoch": 4.643490031996062, + "grad_norm": 0.9451001286506653, + "learning_rate": 6.103075920586685e-07, + "loss": 0.5594, + "step": 12579 + }, + { + "epoch": 4.6438592173270985, + "grad_norm": 0.9486192464828491, + "learning_rate": 6.099891915272451e-07, + "loss": 0.627, + "step": 12580 + }, + { + "epoch": 4.644228402658134, + "grad_norm": 1.0112926959991455, + "learning_rate": 6.096708625321329e-07, + "loss": 0.5859, + "step": 12581 + }, + { + "epoch": 4.64459758798917, + "grad_norm": 0.9952248930931091, + "learning_rate": 6.093526050853818e-07, + "loss": 0.5961, + "step": 12582 + }, + { + "epoch": 4.644966773320207, + "grad_norm": 0.9734710454940796, + "learning_rate": 6.090344191990388e-07, + "loss": 0.5823, + "step": 12583 + }, + { + "epoch": 4.645335958651243, + "grad_norm": 0.9700580835342407, + "learning_rate": 6.087163048851461e-07, + "loss": 0.5941, + "step": 12584 + }, + { + "epoch": 4.645705143982279, + "grad_norm": 0.9890457987785339, + "learning_rate": 6.083982621557444e-07, + "loss": 0.5899, + "step": 12585 + }, + { + "epoch": 4.646074329313315, + "grad_norm": 0.9619045853614807, + "learning_rate": 6.080802910228709e-07, + "loss": 0.5812, + "step": 12586 + }, + { + "epoch": 4.646443514644352, + "grad_norm": 0.9684261083602905, + "learning_rate": 6.077623914985611e-07, + "loss": 0.5577, + "step": 12587 + }, + { + "epoch": 4.646812699975388, + "grad_norm": 0.9447489976882935, + "learning_rate": 6.074445635948492e-07, + "loss": 0.6008, + "step": 12588 + }, + { + "epoch": 4.6471818853064235, + "grad_norm": 0.9890772700309753, + "learning_rate": 6.071268073237618e-07, + "loss": 0.571, + "step": 12589 + }, + { + "epoch": 4.64755107063746, + "grad_norm": 0.9974474310874939, + "learning_rate": 6.06809122697328e-07, + "loss": 0.5507, + "step": 12590 + }, + { + "epoch": 4.647920255968496, + "grad_norm": 0.9209325909614563, + "learning_rate": 6.064915097275706e-07, + "loss": 0.5524, + "step": 12591 + }, + { + "epoch": 4.648289441299532, + "grad_norm": 0.9292459487915039, + "learning_rate": 6.061739684265128e-07, + "loss": 0.5556, + "step": 12592 + }, + { + "epoch": 4.648658626630569, + "grad_norm": 0.960205614566803, + "learning_rate": 6.058564988061725e-07, + "loss": 0.5572, + "step": 12593 + }, + { + "epoch": 4.649027811961605, + "grad_norm": 0.9564794301986694, + "learning_rate": 6.055391008785653e-07, + "loss": 0.5672, + "step": 12594 + }, + { + "epoch": 4.649396997292641, + "grad_norm": 0.9362913370132446, + "learning_rate": 6.052217746557059e-07, + "loss": 0.5581, + "step": 12595 + }, + { + "epoch": 4.649766182623678, + "grad_norm": 0.9730167388916016, + "learning_rate": 6.049045201496042e-07, + "loss": 0.5625, + "step": 12596 + }, + { + "epoch": 4.6501353679547135, + "grad_norm": 0.9756707549095154, + "learning_rate": 6.045873373722677e-07, + "loss": 0.5674, + "step": 12597 + }, + { + "epoch": 4.650504553285749, + "grad_norm": 0.9647897481918335, + "learning_rate": 6.042702263357031e-07, + "loss": 0.587, + "step": 12598 + }, + { + "epoch": 4.650873738616785, + "grad_norm": 0.9626193046569824, + "learning_rate": 6.039531870519113e-07, + "loss": 0.535, + "step": 12599 + }, + { + "epoch": 4.651242923947822, + "grad_norm": 0.9513717889785767, + "learning_rate": 6.036362195328938e-07, + "loss": 0.5848, + "step": 12600 + }, + { + "epoch": 4.651612109278858, + "grad_norm": 0.9705761671066284, + "learning_rate": 6.033193237906468e-07, + "loss": 0.5804, + "step": 12601 + }, + { + "epoch": 4.651981294609894, + "grad_norm": 0.9672080874443054, + "learning_rate": 6.030024998371642e-07, + "loss": 0.5547, + "step": 12602 + }, + { + "epoch": 4.65235047994093, + "grad_norm": 0.9655557870864868, + "learning_rate": 6.026857476844389e-07, + "loss": 0.5808, + "step": 12603 + }, + { + "epoch": 4.652719665271967, + "grad_norm": 0.9573763608932495, + "learning_rate": 6.02369067344459e-07, + "loss": 0.6044, + "step": 12604 + }, + { + "epoch": 4.653088850603003, + "grad_norm": 0.9736968874931335, + "learning_rate": 6.020524588292107e-07, + "loss": 0.5738, + "step": 12605 + }, + { + "epoch": 4.6534580359340385, + "grad_norm": 0.9411924481391907, + "learning_rate": 6.017359221506782e-07, + "loss": 0.572, + "step": 12606 + }, + { + "epoch": 4.653827221265075, + "grad_norm": 0.9611051082611084, + "learning_rate": 6.014194573208415e-07, + "loss": 0.5433, + "step": 12607 + }, + { + "epoch": 4.654196406596111, + "grad_norm": 0.9466496706008911, + "learning_rate": 6.011030643516805e-07, + "loss": 0.5783, + "step": 12608 + }, + { + "epoch": 4.654565591927147, + "grad_norm": 0.9492084383964539, + "learning_rate": 6.007867432551675e-07, + "loss": 0.5793, + "step": 12609 + }, + { + "epoch": 4.654934777258184, + "grad_norm": 0.909889817237854, + "learning_rate": 6.004704940432774e-07, + "loss": 0.523, + "step": 12610 + }, + { + "epoch": 4.65530396258922, + "grad_norm": 0.9394568204879761, + "learning_rate": 6.0015431672798e-07, + "loss": 0.5474, + "step": 12611 + }, + { + "epoch": 4.655673147920256, + "grad_norm": 0.9668317437171936, + "learning_rate": 5.99838211321242e-07, + "loss": 0.5264, + "step": 12612 + }, + { + "epoch": 4.656042333251293, + "grad_norm": 0.9537863731384277, + "learning_rate": 5.995221778350283e-07, + "loss": 0.5742, + "step": 12613 + }, + { + "epoch": 4.6564115185823285, + "grad_norm": 1.0063402652740479, + "learning_rate": 5.992062162812994e-07, + "loss": 0.5543, + "step": 12614 + }, + { + "epoch": 4.656780703913364, + "grad_norm": 0.9719372391700745, + "learning_rate": 5.988903266720153e-07, + "loss": 0.6026, + "step": 12615 + }, + { + "epoch": 4.6571498892444, + "grad_norm": 0.9757847189903259, + "learning_rate": 5.985745090191339e-07, + "loss": 0.5828, + "step": 12616 + }, + { + "epoch": 4.657519074575437, + "grad_norm": 0.9655924439430237, + "learning_rate": 5.982587633346059e-07, + "loss": 0.5431, + "step": 12617 + }, + { + "epoch": 4.657888259906473, + "grad_norm": 0.9911332130432129, + "learning_rate": 5.97943089630384e-07, + "loss": 0.5998, + "step": 12618 + }, + { + "epoch": 4.658257445237509, + "grad_norm": 0.959700882434845, + "learning_rate": 5.976274879184151e-07, + "loss": 0.5719, + "step": 12619 + }, + { + "epoch": 4.658626630568546, + "grad_norm": 0.9738537073135376, + "learning_rate": 5.973119582106466e-07, + "loss": 0.5754, + "step": 12620 + }, + { + "epoch": 4.658995815899582, + "grad_norm": 0.9690144658088684, + "learning_rate": 5.969965005190195e-07, + "loss": 0.5834, + "step": 12621 + }, + { + "epoch": 4.659365001230618, + "grad_norm": 0.9503282904624939, + "learning_rate": 5.966811148554736e-07, + "loss": 0.597, + "step": 12622 + }, + { + "epoch": 4.6597341865616535, + "grad_norm": 0.9594148993492126, + "learning_rate": 5.963658012319476e-07, + "loss": 0.551, + "step": 12623 + }, + { + "epoch": 4.66010337189269, + "grad_norm": 0.9418379664421082, + "learning_rate": 5.960505596603753e-07, + "loss": 0.5483, + "step": 12624 + }, + { + "epoch": 4.660472557223726, + "grad_norm": 0.9339421391487122, + "learning_rate": 5.957353901526878e-07, + "loss": 0.5987, + "step": 12625 + }, + { + "epoch": 4.660841742554762, + "grad_norm": 0.9663743376731873, + "learning_rate": 5.954202927208153e-07, + "loss": 0.5855, + "step": 12626 + }, + { + "epoch": 4.661210927885799, + "grad_norm": 0.949765682220459, + "learning_rate": 5.951052673766827e-07, + "loss": 0.5694, + "step": 12627 + }, + { + "epoch": 4.661580113216835, + "grad_norm": 0.9639202356338501, + "learning_rate": 5.947903141322161e-07, + "loss": 0.5707, + "step": 12628 + }, + { + "epoch": 4.661949298547871, + "grad_norm": 0.9442463517189026, + "learning_rate": 5.944754329993338e-07, + "loss": 0.5583, + "step": 12629 + }, + { + "epoch": 4.662318483878908, + "grad_norm": 0.9395076036453247, + "learning_rate": 5.941606239899545e-07, + "loss": 0.5598, + "step": 12630 + }, + { + "epoch": 4.6626876692099435, + "grad_norm": 0.9296116828918457, + "learning_rate": 5.93845887115995e-07, + "loss": 0.6116, + "step": 12631 + }, + { + "epoch": 4.663056854540979, + "grad_norm": 0.955474317073822, + "learning_rate": 5.935312223893672e-07, + "loss": 0.5695, + "step": 12632 + }, + { + "epoch": 4.663426039872016, + "grad_norm": 0.9489027857780457, + "learning_rate": 5.932166298219807e-07, + "loss": 0.5504, + "step": 12633 + }, + { + "epoch": 4.663795225203052, + "grad_norm": 0.9470047950744629, + "learning_rate": 5.929021094257423e-07, + "loss": 0.5478, + "step": 12634 + }, + { + "epoch": 4.664164410534088, + "grad_norm": 0.9519688487052917, + "learning_rate": 5.925876612125573e-07, + "loss": 0.5668, + "step": 12635 + }, + { + "epoch": 4.664533595865124, + "grad_norm": 0.9862395524978638, + "learning_rate": 5.922732851943286e-07, + "loss": 0.5913, + "step": 12636 + }, + { + "epoch": 4.664902781196161, + "grad_norm": 0.9893099665641785, + "learning_rate": 5.919589813829527e-07, + "loss": 0.5943, + "step": 12637 + }, + { + "epoch": 4.665271966527197, + "grad_norm": 0.9775111079216003, + "learning_rate": 5.916447497903268e-07, + "loss": 0.594, + "step": 12638 + }, + { + "epoch": 4.665641151858233, + "grad_norm": 0.9564189314842224, + "learning_rate": 5.913305904283462e-07, + "loss": 0.5899, + "step": 12639 + }, + { + "epoch": 4.666010337189269, + "grad_norm": 1.0297640562057495, + "learning_rate": 5.910165033089e-07, + "loss": 0.5628, + "step": 12640 + }, + { + "epoch": 4.666379522520305, + "grad_norm": 0.9289674162864685, + "learning_rate": 5.907024884438764e-07, + "loss": 0.5809, + "step": 12641 + }, + { + "epoch": 4.666748707851341, + "grad_norm": 0.9378930330276489, + "learning_rate": 5.903885458451608e-07, + "loss": 0.5688, + "step": 12642 + }, + { + "epoch": 4.667117893182377, + "grad_norm": 0.9754272699356079, + "learning_rate": 5.900746755246364e-07, + "loss": 0.6062, + "step": 12643 + }, + { + "epoch": 4.667487078513414, + "grad_norm": 0.9710173010826111, + "learning_rate": 5.89760877494183e-07, + "loss": 0.5697, + "step": 12644 + }, + { + "epoch": 4.66785626384445, + "grad_norm": 0.9390226006507874, + "learning_rate": 5.894471517656767e-07, + "loss": 0.5429, + "step": 12645 + }, + { + "epoch": 4.668225449175486, + "grad_norm": 0.9425497055053711, + "learning_rate": 5.891334983509933e-07, + "loss": 0.5801, + "step": 12646 + }, + { + "epoch": 4.668594634506523, + "grad_norm": 0.9469896554946899, + "learning_rate": 5.888199172620035e-07, + "loss": 0.5997, + "step": 12647 + }, + { + "epoch": 4.6689638198375585, + "grad_norm": 0.9870802164077759, + "learning_rate": 5.885064085105771e-07, + "loss": 0.5614, + "step": 12648 + }, + { + "epoch": 4.669333005168594, + "grad_norm": 0.9581458568572998, + "learning_rate": 5.881929721085799e-07, + "loss": 0.5737, + "step": 12649 + }, + { + "epoch": 4.669702190499631, + "grad_norm": 0.9524660706520081, + "learning_rate": 5.878796080678747e-07, + "loss": 0.5522, + "step": 12650 + }, + { + "epoch": 4.670071375830667, + "grad_norm": 0.9703357219696045, + "learning_rate": 5.875663164003236e-07, + "loss": 0.5595, + "step": 12651 + }, + { + "epoch": 4.670440561161703, + "grad_norm": 0.9095121622085571, + "learning_rate": 5.872530971177839e-07, + "loss": 0.5389, + "step": 12652 + }, + { + "epoch": 4.67080974649274, + "grad_norm": 0.9528265595436096, + "learning_rate": 5.869399502321099e-07, + "loss": 0.5716, + "step": 12653 + }, + { + "epoch": 4.671178931823776, + "grad_norm": 0.9424470663070679, + "learning_rate": 5.866268757551557e-07, + "loss": 0.5987, + "step": 12654 + }, + { + "epoch": 4.671548117154812, + "grad_norm": 0.9646139740943909, + "learning_rate": 5.8631387369877e-07, + "loss": 0.5944, + "step": 12655 + }, + { + "epoch": 4.671917302485848, + "grad_norm": 0.9597181677818298, + "learning_rate": 5.860009440748013e-07, + "loss": 0.5833, + "step": 12656 + }, + { + "epoch": 4.672286487816884, + "grad_norm": 0.9858836531639099, + "learning_rate": 5.856880868950915e-07, + "loss": 0.5381, + "step": 12657 + }, + { + "epoch": 4.67265567314792, + "grad_norm": 0.9522179365158081, + "learning_rate": 5.853753021714833e-07, + "loss": 0.5677, + "step": 12658 + }, + { + "epoch": 4.673024858478956, + "grad_norm": 0.9411221146583557, + "learning_rate": 5.850625899158166e-07, + "loss": 0.5553, + "step": 12659 + }, + { + "epoch": 4.673394043809992, + "grad_norm": 0.954826295375824, + "learning_rate": 5.847499501399265e-07, + "loss": 0.563, + "step": 12660 + }, + { + "epoch": 4.673763229141029, + "grad_norm": 0.9534344673156738, + "learning_rate": 5.844373828556463e-07, + "loss": 0.5263, + "step": 12661 + }, + { + "epoch": 4.674132414472065, + "grad_norm": 0.9606667757034302, + "learning_rate": 5.841248880748057e-07, + "loss": 0.5715, + "step": 12662 + }, + { + "epoch": 4.674501599803101, + "grad_norm": 0.9527658820152283, + "learning_rate": 5.838124658092337e-07, + "loss": 0.5441, + "step": 12663 + }, + { + "epoch": 4.674870785134138, + "grad_norm": 0.9566532969474792, + "learning_rate": 5.835001160707563e-07, + "loss": 0.5854, + "step": 12664 + }, + { + "epoch": 4.6752399704651735, + "grad_norm": 0.9355337619781494, + "learning_rate": 5.831878388711934e-07, + "loss": 0.5707, + "step": 12665 + }, + { + "epoch": 4.675609155796209, + "grad_norm": 0.9875616431236267, + "learning_rate": 5.828756342223662e-07, + "loss": 0.5775, + "step": 12666 + }, + { + "epoch": 4.675978341127246, + "grad_norm": 0.9425673484802246, + "learning_rate": 5.825635021360909e-07, + "loss": 0.5852, + "step": 12667 + }, + { + "epoch": 4.676347526458282, + "grad_norm": 0.9225761294364929, + "learning_rate": 5.822514426241823e-07, + "loss": 0.5741, + "step": 12668 + }, + { + "epoch": 4.676716711789318, + "grad_norm": 0.9373415112495422, + "learning_rate": 5.819394556984517e-07, + "loss": 0.5472, + "step": 12669 + }, + { + "epoch": 4.677085897120355, + "grad_norm": 0.9335290193557739, + "learning_rate": 5.816275413707064e-07, + "loss": 0.571, + "step": 12670 + }, + { + "epoch": 4.677455082451391, + "grad_norm": 1.0079026222229004, + "learning_rate": 5.813156996527539e-07, + "loss": 0.5813, + "step": 12671 + }, + { + "epoch": 4.677824267782427, + "grad_norm": 0.9124979972839355, + "learning_rate": 5.810039305563967e-07, + "loss": 0.5655, + "step": 12672 + }, + { + "epoch": 4.678193453113463, + "grad_norm": 0.9136505126953125, + "learning_rate": 5.806922340934345e-07, + "loss": 0.5586, + "step": 12673 + }, + { + "epoch": 4.678562638444499, + "grad_norm": 0.9561769962310791, + "learning_rate": 5.80380610275666e-07, + "loss": 0.5726, + "step": 12674 + }, + { + "epoch": 4.678931823775535, + "grad_norm": 0.9755136966705322, + "learning_rate": 5.800690591148849e-07, + "loss": 0.5722, + "step": 12675 + }, + { + "epoch": 4.679301009106571, + "grad_norm": 0.9411891102790833, + "learning_rate": 5.797575806228847e-07, + "loss": 0.5731, + "step": 12676 + }, + { + "epoch": 4.679670194437608, + "grad_norm": 0.9603943824768066, + "learning_rate": 5.794461748114541e-07, + "loss": 0.5861, + "step": 12677 + }, + { + "epoch": 4.680039379768644, + "grad_norm": 0.9848154783248901, + "learning_rate": 5.79134841692379e-07, + "loss": 0.5971, + "step": 12678 + }, + { + "epoch": 4.68040856509968, + "grad_norm": 0.9796246290206909, + "learning_rate": 5.788235812774442e-07, + "loss": 0.598, + "step": 12679 + }, + { + "epoch": 4.680777750430716, + "grad_norm": 0.9785093069076538, + "learning_rate": 5.785123935784309e-07, + "loss": 0.5802, + "step": 12680 + }, + { + "epoch": 4.681146935761753, + "grad_norm": 0.9572718739509583, + "learning_rate": 5.782012786071162e-07, + "loss": 0.5629, + "step": 12681 + }, + { + "epoch": 4.6815161210927885, + "grad_norm": 0.9583417773246765, + "learning_rate": 5.778902363752775e-07, + "loss": 0.6098, + "step": 12682 + }, + { + "epoch": 4.681885306423824, + "grad_norm": 0.9485536813735962, + "learning_rate": 5.775792668946858e-07, + "loss": 0.5959, + "step": 12683 + }, + { + "epoch": 4.682254491754861, + "grad_norm": 0.9641349911689758, + "learning_rate": 5.772683701771136e-07, + "loss": 0.5636, + "step": 12684 + }, + { + "epoch": 4.682623677085897, + "grad_norm": 0.9671340584754944, + "learning_rate": 5.769575462343255e-07, + "loss": 0.5733, + "step": 12685 + }, + { + "epoch": 4.682992862416933, + "grad_norm": 0.9629853963851929, + "learning_rate": 5.766467950780872e-07, + "loss": 0.5409, + "step": 12686 + }, + { + "epoch": 4.68336204774797, + "grad_norm": 0.9873899817466736, + "learning_rate": 5.763361167201612e-07, + "loss": 0.5841, + "step": 12687 + }, + { + "epoch": 4.683731233079006, + "grad_norm": 0.972788393497467, + "learning_rate": 5.760255111723065e-07, + "loss": 0.5563, + "step": 12688 + }, + { + "epoch": 4.684100418410042, + "grad_norm": 0.9901572465896606, + "learning_rate": 5.757149784462788e-07, + "loss": 0.5506, + "step": 12689 + }, + { + "epoch": 4.6844696037410785, + "grad_norm": 0.9886318445205688, + "learning_rate": 5.754045185538309e-07, + "loss": 0.5934, + "step": 12690 + }, + { + "epoch": 4.684838789072114, + "grad_norm": 0.9860857129096985, + "learning_rate": 5.750941315067154e-07, + "loss": 0.5833, + "step": 12691 + }, + { + "epoch": 4.68520797440315, + "grad_norm": 0.9579238295555115, + "learning_rate": 5.747838173166797e-07, + "loss": 0.5531, + "step": 12692 + }, + { + "epoch": 4.685577159734186, + "grad_norm": 0.9691939353942871, + "learning_rate": 5.744735759954681e-07, + "loss": 0.6003, + "step": 12693 + }, + { + "epoch": 4.685946345065223, + "grad_norm": 0.9888288378715515, + "learning_rate": 5.741634075548247e-07, + "loss": 0.549, + "step": 12694 + }, + { + "epoch": 4.686315530396259, + "grad_norm": 0.9418739080429077, + "learning_rate": 5.738533120064876e-07, + "loss": 0.5338, + "step": 12695 + }, + { + "epoch": 4.686684715727295, + "grad_norm": 0.9517049193382263, + "learning_rate": 5.735432893621956e-07, + "loss": 0.5756, + "step": 12696 + }, + { + "epoch": 4.687053901058332, + "grad_norm": 0.947474479675293, + "learning_rate": 5.73233339633682e-07, + "loss": 0.5645, + "step": 12697 + }, + { + "epoch": 4.6874230863893676, + "grad_norm": 0.9668267369270325, + "learning_rate": 5.72923462832678e-07, + "loss": 0.5703, + "step": 12698 + }, + { + "epoch": 4.6877922717204035, + "grad_norm": 0.9431071877479553, + "learning_rate": 5.72613658970913e-07, + "loss": 0.5427, + "step": 12699 + }, + { + "epoch": 4.688161457051439, + "grad_norm": 0.9525395035743713, + "learning_rate": 5.72303928060113e-07, + "loss": 0.5619, + "step": 12700 + }, + { + "epoch": 4.688530642382476, + "grad_norm": 0.9655390977859497, + "learning_rate": 5.719942701120002e-07, + "loss": 0.5779, + "step": 12701 + }, + { + "epoch": 4.688899827713512, + "grad_norm": 0.9364953637123108, + "learning_rate": 5.716846851382965e-07, + "loss": 0.5739, + "step": 12702 + }, + { + "epoch": 4.689269013044548, + "grad_norm": 0.9496989846229553, + "learning_rate": 5.713751731507181e-07, + "loss": 0.5975, + "step": 12703 + }, + { + "epoch": 4.689638198375585, + "grad_norm": 0.9606320858001709, + "learning_rate": 5.710657341609821e-07, + "loss": 0.5735, + "step": 12704 + }, + { + "epoch": 4.690007383706621, + "grad_norm": 0.9398759007453918, + "learning_rate": 5.707563681807981e-07, + "loss": 0.5367, + "step": 12705 + }, + { + "epoch": 4.690376569037657, + "grad_norm": 0.92044997215271, + "learning_rate": 5.704470752218766e-07, + "loss": 0.5486, + "step": 12706 + }, + { + "epoch": 4.6907457543686935, + "grad_norm": 0.9558910727500916, + "learning_rate": 5.70137855295925e-07, + "loss": 0.5863, + "step": 12707 + }, + { + "epoch": 4.691114939699729, + "grad_norm": 0.9964326620101929, + "learning_rate": 5.698287084146464e-07, + "loss": 0.5985, + "step": 12708 + }, + { + "epoch": 4.691484125030765, + "grad_norm": 1.0079420804977417, + "learning_rate": 5.695196345897419e-07, + "loss": 0.5818, + "step": 12709 + }, + { + "epoch": 4.691853310361802, + "grad_norm": 1.0825700759887695, + "learning_rate": 5.692106338329095e-07, + "loss": 0.5821, + "step": 12710 + }, + { + "epoch": 4.692222495692838, + "grad_norm": 0.9646375179290771, + "learning_rate": 5.689017061558458e-07, + "loss": 0.5452, + "step": 12711 + }, + { + "epoch": 4.692591681023874, + "grad_norm": 0.9429869055747986, + "learning_rate": 5.685928515702433e-07, + "loss": 0.5954, + "step": 12712 + }, + { + "epoch": 4.69296086635491, + "grad_norm": 0.9723039269447327, + "learning_rate": 5.682840700877909e-07, + "loss": 0.555, + "step": 12713 + }, + { + "epoch": 4.693330051685947, + "grad_norm": 0.974105179309845, + "learning_rate": 5.679753617201769e-07, + "loss": 0.5724, + "step": 12714 + }, + { + "epoch": 4.6936992370169826, + "grad_norm": 0.9367263317108154, + "learning_rate": 5.67666726479086e-07, + "loss": 0.576, + "step": 12715 + }, + { + "epoch": 4.6940684223480185, + "grad_norm": 0.9482772946357727, + "learning_rate": 5.673581643762002e-07, + "loss": 0.5949, + "step": 12716 + }, + { + "epoch": 4.694437607679055, + "grad_norm": 0.9747699499130249, + "learning_rate": 5.670496754231977e-07, + "loss": 0.5853, + "step": 12717 + }, + { + "epoch": 4.694806793010091, + "grad_norm": 0.9469984769821167, + "learning_rate": 5.667412596317542e-07, + "loss": 0.5702, + "step": 12718 + }, + { + "epoch": 4.695175978341127, + "grad_norm": 0.9871953129768372, + "learning_rate": 5.664329170135447e-07, + "loss": 0.5921, + "step": 12719 + }, + { + "epoch": 4.695545163672163, + "grad_norm": 0.9885798096656799, + "learning_rate": 5.66124647580239e-07, + "loss": 0.6006, + "step": 12720 + }, + { + "epoch": 4.6959143490032, + "grad_norm": 0.9754877686500549, + "learning_rate": 5.658164513435044e-07, + "loss": 0.5835, + "step": 12721 + }, + { + "epoch": 4.696283534334236, + "grad_norm": 0.9640277624130249, + "learning_rate": 5.655083283150073e-07, + "loss": 0.5865, + "step": 12722 + }, + { + "epoch": 4.696652719665272, + "grad_norm": 0.9449736475944519, + "learning_rate": 5.652002785064087e-07, + "loss": 0.5909, + "step": 12723 + }, + { + "epoch": 4.6970219049963085, + "grad_norm": 0.986321210861206, + "learning_rate": 5.648923019293695e-07, + "loss": 0.5825, + "step": 12724 + }, + { + "epoch": 4.697391090327344, + "grad_norm": 0.9340388774871826, + "learning_rate": 5.645843985955462e-07, + "loss": 0.5615, + "step": 12725 + }, + { + "epoch": 4.69776027565838, + "grad_norm": 0.9489978551864624, + "learning_rate": 5.642765685165918e-07, + "loss": 0.5549, + "step": 12726 + }, + { + "epoch": 4.698129460989417, + "grad_norm": 0.9618688225746155, + "learning_rate": 5.639688117041589e-07, + "loss": 0.5797, + "step": 12727 + }, + { + "epoch": 4.698498646320453, + "grad_norm": 0.94565749168396, + "learning_rate": 5.636611281698956e-07, + "loss": 0.5377, + "step": 12728 + }, + { + "epoch": 4.698867831651489, + "grad_norm": 0.9735034704208374, + "learning_rate": 5.633535179254468e-07, + "loss": 0.5837, + "step": 12729 + }, + { + "epoch": 4.699237016982526, + "grad_norm": 1.032600998878479, + "learning_rate": 5.630459809824571e-07, + "loss": 0.5666, + "step": 12730 + }, + { + "epoch": 4.699606202313562, + "grad_norm": 0.979930579662323, + "learning_rate": 5.627385173525652e-07, + "loss": 0.5471, + "step": 12731 + }, + { + "epoch": 4.6999753876445975, + "grad_norm": 0.9545853137969971, + "learning_rate": 5.624311270474092e-07, + "loss": 0.5676, + "step": 12732 + }, + { + "epoch": 4.7003445729756335, + "grad_norm": 0.9438894987106323, + "learning_rate": 5.62123810078623e-07, + "loss": 0.5504, + "step": 12733 + }, + { + "epoch": 4.70071375830667, + "grad_norm": 0.9425113797187805, + "learning_rate": 5.618165664578387e-07, + "loss": 0.5783, + "step": 12734 + }, + { + "epoch": 4.701082943637706, + "grad_norm": 0.9320715069770813, + "learning_rate": 5.615093961966867e-07, + "loss": 0.5644, + "step": 12735 + }, + { + "epoch": 4.701452128968742, + "grad_norm": 0.9439231753349304, + "learning_rate": 5.612022993067919e-07, + "loss": 0.5598, + "step": 12736 + }, + { + "epoch": 4.701821314299778, + "grad_norm": 0.9493051171302795, + "learning_rate": 5.608952757997785e-07, + "loss": 0.5906, + "step": 12737 + }, + { + "epoch": 4.702190499630815, + "grad_norm": 0.9765157699584961, + "learning_rate": 5.60588325687266e-07, + "loss": 0.5882, + "step": 12738 + }, + { + "epoch": 4.702559684961851, + "grad_norm": 0.9577839970588684, + "learning_rate": 5.602814489808742e-07, + "loss": 0.5781, + "step": 12739 + }, + { + "epoch": 4.702928870292887, + "grad_norm": 0.9618726968765259, + "learning_rate": 5.59974645692217e-07, + "loss": 0.5707, + "step": 12740 + }, + { + "epoch": 4.7032980556239234, + "grad_norm": 1.0010968446731567, + "learning_rate": 5.596679158329069e-07, + "loss": 0.5724, + "step": 12741 + }, + { + "epoch": 4.703667240954959, + "grad_norm": 0.9204011559486389, + "learning_rate": 5.593612594145542e-07, + "loss": 0.5836, + "step": 12742 + }, + { + "epoch": 4.704036426285995, + "grad_norm": 0.9731946587562561, + "learning_rate": 5.590546764487647e-07, + "loss": 0.5897, + "step": 12743 + }, + { + "epoch": 4.704405611617032, + "grad_norm": 0.9527516961097717, + "learning_rate": 5.587481669471439e-07, + "loss": 0.5384, + "step": 12744 + }, + { + "epoch": 4.704774796948068, + "grad_norm": 0.9757479429244995, + "learning_rate": 5.584417309212925e-07, + "loss": 0.5779, + "step": 12745 + }, + { + "epoch": 4.705143982279104, + "grad_norm": 0.9514113068580627, + "learning_rate": 5.581353683828078e-07, + "loss": 0.579, + "step": 12746 + }, + { + "epoch": 4.705513167610141, + "grad_norm": 0.9574370980262756, + "learning_rate": 5.578290793432875e-07, + "loss": 0.6346, + "step": 12747 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.9542053937911987, + "learning_rate": 5.575228638143235e-07, + "loss": 0.5686, + "step": 12748 + }, + { + "epoch": 4.7062515382722125, + "grad_norm": 0.9878913760185242, + "learning_rate": 5.572167218075053e-07, + "loss": 0.5924, + "step": 12749 + }, + { + "epoch": 4.7066207236032485, + "grad_norm": 0.9864073991775513, + "learning_rate": 5.569106533344215e-07, + "loss": 0.5823, + "step": 12750 + }, + { + "epoch": 4.706989908934285, + "grad_norm": 0.9787482619285583, + "learning_rate": 5.566046584066568e-07, + "loss": 0.5783, + "step": 12751 + }, + { + "epoch": 4.707359094265321, + "grad_norm": 0.9527290463447571, + "learning_rate": 5.562987370357911e-07, + "loss": 0.5494, + "step": 12752 + }, + { + "epoch": 4.707728279596357, + "grad_norm": 0.9365574717521667, + "learning_rate": 5.55992889233406e-07, + "loss": 0.6194, + "step": 12753 + }, + { + "epoch": 4.708097464927394, + "grad_norm": 1.0258007049560547, + "learning_rate": 5.556871150110755e-07, + "loss": 0.5741, + "step": 12754 + }, + { + "epoch": 4.70846665025843, + "grad_norm": 0.934313952922821, + "learning_rate": 5.553814143803749e-07, + "loss": 0.5405, + "step": 12755 + }, + { + "epoch": 4.708835835589466, + "grad_norm": 0.9545559287071228, + "learning_rate": 5.55075787352874e-07, + "loss": 0.5918, + "step": 12756 + }, + { + "epoch": 4.709205020920502, + "grad_norm": 0.9537572860717773, + "learning_rate": 5.547702339401397e-07, + "loss": 0.586, + "step": 12757 + }, + { + "epoch": 4.709574206251538, + "grad_norm": 0.9682947993278503, + "learning_rate": 5.544647541537393e-07, + "loss": 0.5674, + "step": 12758 + }, + { + "epoch": 4.709943391582574, + "grad_norm": 0.9483765959739685, + "learning_rate": 5.541593480052338e-07, + "loss": 0.6034, + "step": 12759 + }, + { + "epoch": 4.71031257691361, + "grad_norm": 0.9704323410987854, + "learning_rate": 5.538540155061825e-07, + "loss": 0.5617, + "step": 12760 + }, + { + "epoch": 4.710681762244647, + "grad_norm": 0.9625968337059021, + "learning_rate": 5.535487566681421e-07, + "loss": 0.5716, + "step": 12761 + }, + { + "epoch": 4.711050947575683, + "grad_norm": 0.9650824666023254, + "learning_rate": 5.532435715026671e-07, + "loss": 0.5775, + "step": 12762 + }, + { + "epoch": 4.711420132906719, + "grad_norm": 0.9640879034996033, + "learning_rate": 5.529384600213092e-07, + "loss": 0.5818, + "step": 12763 + }, + { + "epoch": 4.711789318237756, + "grad_norm": 0.945060670375824, + "learning_rate": 5.526334222356158e-07, + "loss": 0.5579, + "step": 12764 + }, + { + "epoch": 4.712158503568792, + "grad_norm": 0.9289738535881042, + "learning_rate": 5.523284581571329e-07, + "loss": 0.5858, + "step": 12765 + }, + { + "epoch": 4.7125276888998275, + "grad_norm": 0.9728955030441284, + "learning_rate": 5.520235677974025e-07, + "loss": 0.5844, + "step": 12766 + }, + { + "epoch": 4.712896874230864, + "grad_norm": 0.9749798774719238, + "learning_rate": 5.517187511679659e-07, + "loss": 0.6102, + "step": 12767 + }, + { + "epoch": 4.7132660595619, + "grad_norm": 0.9724915027618408, + "learning_rate": 5.514140082803599e-07, + "loss": 0.6018, + "step": 12768 + }, + { + "epoch": 4.713635244892936, + "grad_norm": 0.9427294731140137, + "learning_rate": 5.511093391461178e-07, + "loss": 0.5432, + "step": 12769 + }, + { + "epoch": 4.714004430223972, + "grad_norm": 0.9289034605026245, + "learning_rate": 5.508047437767728e-07, + "loss": 0.5479, + "step": 12770 + }, + { + "epoch": 4.714373615555009, + "grad_norm": 0.9767047762870789, + "learning_rate": 5.505002221838532e-07, + "loss": 0.6077, + "step": 12771 + }, + { + "epoch": 4.714742800886045, + "grad_norm": 0.9625318646430969, + "learning_rate": 5.501957743788841e-07, + "loss": 0.556, + "step": 12772 + }, + { + "epoch": 4.715111986217081, + "grad_norm": 0.9718109369277954, + "learning_rate": 5.498914003733905e-07, + "loss": 0.5675, + "step": 12773 + }, + { + "epoch": 4.7154811715481175, + "grad_norm": 0.9470216035842896, + "learning_rate": 5.49587100178891e-07, + "loss": 0.5637, + "step": 12774 + }, + { + "epoch": 4.715850356879153, + "grad_norm": 0.9551165699958801, + "learning_rate": 5.492828738069048e-07, + "loss": 0.5683, + "step": 12775 + }, + { + "epoch": 4.716219542210189, + "grad_norm": 0.9260233044624329, + "learning_rate": 5.489787212689466e-07, + "loss": 0.5833, + "step": 12776 + }, + { + "epoch": 4.716588727541225, + "grad_norm": 0.9744818210601807, + "learning_rate": 5.486746425765269e-07, + "loss": 0.5675, + "step": 12777 + }, + { + "epoch": 4.716957912872262, + "grad_norm": 0.9616488218307495, + "learning_rate": 5.483706377411571e-07, + "loss": 0.5897, + "step": 12778 + }, + { + "epoch": 4.717327098203298, + "grad_norm": 0.944340169429779, + "learning_rate": 5.480667067743425e-07, + "loss": 0.5338, + "step": 12779 + }, + { + "epoch": 4.717696283534334, + "grad_norm": 0.9735473990440369, + "learning_rate": 5.477628496875872e-07, + "loss": 0.579, + "step": 12780 + }, + { + "epoch": 4.718065468865371, + "grad_norm": 0.924416184425354, + "learning_rate": 5.474590664923913e-07, + "loss": 0.5221, + "step": 12781 + }, + { + "epoch": 4.718434654196407, + "grad_norm": 0.944614052772522, + "learning_rate": 5.471553572002533e-07, + "loss": 0.5704, + "step": 12782 + }, + { + "epoch": 4.7188038395274425, + "grad_norm": 0.9580678343772888, + "learning_rate": 5.468517218226693e-07, + "loss": 0.608, + "step": 12783 + }, + { + "epoch": 4.719173024858479, + "grad_norm": 0.9958012104034424, + "learning_rate": 5.465481603711315e-07, + "loss": 0.5588, + "step": 12784 + }, + { + "epoch": 4.719542210189515, + "grad_norm": 0.9548967480659485, + "learning_rate": 5.462446728571292e-07, + "loss": 0.5513, + "step": 12785 + }, + { + "epoch": 4.719911395520551, + "grad_norm": 0.9375609755516052, + "learning_rate": 5.459412592921486e-07, + "loss": 0.5604, + "step": 12786 + }, + { + "epoch": 4.720280580851588, + "grad_norm": 0.9445386528968811, + "learning_rate": 5.456379196876755e-07, + "loss": 0.5742, + "step": 12787 + }, + { + "epoch": 4.720649766182624, + "grad_norm": 0.9556133151054382, + "learning_rate": 5.453346540551901e-07, + "loss": 0.6135, + "step": 12788 + }, + { + "epoch": 4.72101895151366, + "grad_norm": 0.9593240022659302, + "learning_rate": 5.450314624061706e-07, + "loss": 0.5543, + "step": 12789 + }, + { + "epoch": 4.721388136844696, + "grad_norm": 0.9774720072746277, + "learning_rate": 5.447283447520932e-07, + "loss": 0.559, + "step": 12790 + }, + { + "epoch": 4.7217573221757325, + "grad_norm": 0.9329657554626465, + "learning_rate": 5.444253011044323e-07, + "loss": 0.5914, + "step": 12791 + }, + { + "epoch": 4.722126507506768, + "grad_norm": 0.957624614238739, + "learning_rate": 5.441223314746553e-07, + "loss": 0.5302, + "step": 12792 + }, + { + "epoch": 4.722495692837804, + "grad_norm": 0.9708669781684875, + "learning_rate": 5.438194358742314e-07, + "loss": 0.6009, + "step": 12793 + }, + { + "epoch": 4.72286487816884, + "grad_norm": 0.968586266040802, + "learning_rate": 5.435166143146237e-07, + "loss": 0.5883, + "step": 12794 + }, + { + "epoch": 4.723234063499877, + "grad_norm": 0.9477389454841614, + "learning_rate": 5.432138668072953e-07, + "loss": 0.5726, + "step": 12795 + }, + { + "epoch": 4.723603248830913, + "grad_norm": 0.9792276620864868, + "learning_rate": 5.429111933637046e-07, + "loss": 0.5938, + "step": 12796 + }, + { + "epoch": 4.723972434161949, + "grad_norm": 0.9638069868087769, + "learning_rate": 5.426085939953065e-07, + "loss": 0.5845, + "step": 12797 + }, + { + "epoch": 4.724341619492986, + "grad_norm": 0.9651623964309692, + "learning_rate": 5.423060687135564e-07, + "loss": 0.5389, + "step": 12798 + }, + { + "epoch": 4.724710804824022, + "grad_norm": 0.9603493809700012, + "learning_rate": 5.42003617529904e-07, + "loss": 0.5869, + "step": 12799 + }, + { + "epoch": 4.7250799901550575, + "grad_norm": 0.9980363845825195, + "learning_rate": 5.417012404557956e-07, + "loss": 0.5475, + "step": 12800 + }, + { + "epoch": 4.725449175486094, + "grad_norm": 0.9244535565376282, + "learning_rate": 5.413989375026779e-07, + "loss": 0.5763, + "step": 12801 + }, + { + "epoch": 4.72581836081713, + "grad_norm": 1.2097127437591553, + "learning_rate": 5.410967086819918e-07, + "loss": 0.5661, + "step": 12802 + }, + { + "epoch": 4.726187546148166, + "grad_norm": 0.943710207939148, + "learning_rate": 5.407945540051776e-07, + "loss": 0.5584, + "step": 12803 + }, + { + "epoch": 4.726556731479203, + "grad_norm": 0.9617297053337097, + "learning_rate": 5.404924734836714e-07, + "loss": 0.5894, + "step": 12804 + }, + { + "epoch": 4.726925916810239, + "grad_norm": 0.9578562378883362, + "learning_rate": 5.401904671289057e-07, + "loss": 0.5613, + "step": 12805 + }, + { + "epoch": 4.727295102141275, + "grad_norm": 0.9503665566444397, + "learning_rate": 5.39888534952313e-07, + "loss": 0.5563, + "step": 12806 + }, + { + "epoch": 4.727664287472312, + "grad_norm": 0.9645100235939026, + "learning_rate": 5.395866769653207e-07, + "loss": 0.5541, + "step": 12807 + }, + { + "epoch": 4.7280334728033475, + "grad_norm": 0.9448662400245667, + "learning_rate": 5.39284893179354e-07, + "loss": 0.5856, + "step": 12808 + }, + { + "epoch": 4.728402658134383, + "grad_norm": 0.9349389672279358, + "learning_rate": 5.389831836058345e-07, + "loss": 0.6172, + "step": 12809 + }, + { + "epoch": 4.728771843465419, + "grad_norm": 0.9749547243118286, + "learning_rate": 5.386815482561828e-07, + "loss": 0.518, + "step": 12810 + }, + { + "epoch": 4.729141028796456, + "grad_norm": 0.9378690123558044, + "learning_rate": 5.383799871418166e-07, + "loss": 0.6265, + "step": 12811 + }, + { + "epoch": 4.729510214127492, + "grad_norm": 0.9607113003730774, + "learning_rate": 5.380785002741476e-07, + "loss": 0.5923, + "step": 12812 + }, + { + "epoch": 4.729879399458528, + "grad_norm": 0.9466685056686401, + "learning_rate": 5.377770876645888e-07, + "loss": 0.5755, + "step": 12813 + }, + { + "epoch": 4.730248584789564, + "grad_norm": 0.9588108658790588, + "learning_rate": 5.374757493245472e-07, + "loss": 0.5522, + "step": 12814 + }, + { + "epoch": 4.730617770120601, + "grad_norm": 0.9741490483283997, + "learning_rate": 5.371744852654301e-07, + "loss": 0.5817, + "step": 12815 + }, + { + "epoch": 4.730986955451637, + "grad_norm": 0.9429228901863098, + "learning_rate": 5.368732954986389e-07, + "loss": 0.5842, + "step": 12816 + }, + { + "epoch": 4.7313561407826725, + "grad_norm": 0.9753550887107849, + "learning_rate": 5.365721800355733e-07, + "loss": 0.5599, + "step": 12817 + }, + { + "epoch": 4.731725326113709, + "grad_norm": 0.972743034362793, + "learning_rate": 5.362711388876318e-07, + "loss": 0.5703, + "step": 12818 + }, + { + "epoch": 4.732094511444745, + "grad_norm": 0.9966688752174377, + "learning_rate": 5.359701720662077e-07, + "loss": 0.5417, + "step": 12819 + }, + { + "epoch": 4.732463696775781, + "grad_norm": 0.9737197160720825, + "learning_rate": 5.356692795826923e-07, + "loss": 0.5846, + "step": 12820 + }, + { + "epoch": 4.732832882106818, + "grad_norm": 0.9960848689079285, + "learning_rate": 5.353684614484753e-07, + "loss": 0.5836, + "step": 12821 + }, + { + "epoch": 4.733202067437854, + "grad_norm": 0.9566304683685303, + "learning_rate": 5.350677176749413e-07, + "loss": 0.5788, + "step": 12822 + }, + { + "epoch": 4.73357125276889, + "grad_norm": 0.9702402353286743, + "learning_rate": 5.347670482734746e-07, + "loss": 0.6352, + "step": 12823 + }, + { + "epoch": 4.733940438099927, + "grad_norm": 0.9729029536247253, + "learning_rate": 5.344664532554553e-07, + "loss": 0.5507, + "step": 12824 + }, + { + "epoch": 4.7343096234309625, + "grad_norm": 0.9504541754722595, + "learning_rate": 5.341659326322593e-07, + "loss": 0.5615, + "step": 12825 + }, + { + "epoch": 4.734678808761998, + "grad_norm": 0.9555990695953369, + "learning_rate": 5.338654864152631e-07, + "loss": 0.548, + "step": 12826 + }, + { + "epoch": 4.735047994093034, + "grad_norm": 0.9719330668449402, + "learning_rate": 5.335651146158377e-07, + "loss": 0.5297, + "step": 12827 + }, + { + "epoch": 4.735417179424071, + "grad_norm": 0.9248902201652527, + "learning_rate": 5.332648172453517e-07, + "loss": 0.5606, + "step": 12828 + }, + { + "epoch": 4.735786364755107, + "grad_norm": 0.9912894368171692, + "learning_rate": 5.329645943151721e-07, + "loss": 0.6114, + "step": 12829 + }, + { + "epoch": 4.736155550086143, + "grad_norm": 0.9554056525230408, + "learning_rate": 5.326644458366611e-07, + "loss": 0.5902, + "step": 12830 + }, + { + "epoch": 4.73652473541718, + "grad_norm": 0.9660685658454895, + "learning_rate": 5.323643718211813e-07, + "loss": 0.591, + "step": 12831 + }, + { + "epoch": 4.736893920748216, + "grad_norm": 0.9850864410400391, + "learning_rate": 5.320643722800877e-07, + "loss": 0.555, + "step": 12832 + }, + { + "epoch": 4.737263106079252, + "grad_norm": 0.9172880053520203, + "learning_rate": 5.317644472247366e-07, + "loss": 0.5554, + "step": 12833 + }, + { + "epoch": 4.7376322914102875, + "grad_norm": 0.9819920063018799, + "learning_rate": 5.314645966664806e-07, + "loss": 0.5891, + "step": 12834 + }, + { + "epoch": 4.738001476741324, + "grad_norm": 0.9631838202476501, + "learning_rate": 5.311648206166681e-07, + "loss": 0.5477, + "step": 12835 + }, + { + "epoch": 4.73837066207236, + "grad_norm": 1.0106115341186523, + "learning_rate": 5.308651190866459e-07, + "loss": 0.5907, + "step": 12836 + }, + { + "epoch": 4.738739847403396, + "grad_norm": 0.9493407607078552, + "learning_rate": 5.305654920877571e-07, + "loss": 0.6106, + "step": 12837 + }, + { + "epoch": 4.739109032734433, + "grad_norm": 0.9717087149620056, + "learning_rate": 5.302659396313425e-07, + "loss": 0.5786, + "step": 12838 + }, + { + "epoch": 4.739478218065469, + "grad_norm": 0.9548754692077637, + "learning_rate": 5.29966461728742e-07, + "loss": 0.5653, + "step": 12839 + }, + { + "epoch": 4.739847403396505, + "grad_norm": 0.9866426587104797, + "learning_rate": 5.296670583912877e-07, + "loss": 0.5804, + "step": 12840 + }, + { + "epoch": 4.740216588727542, + "grad_norm": 0.9669961929321289, + "learning_rate": 5.293677296303141e-07, + "loss": 0.5777, + "step": 12841 + }, + { + "epoch": 4.7405857740585775, + "grad_norm": 0.9928221702575684, + "learning_rate": 5.290684754571493e-07, + "loss": 0.5612, + "step": 12842 + }, + { + "epoch": 4.740954959389613, + "grad_norm": 0.9325243234634399, + "learning_rate": 5.287692958831214e-07, + "loss": 0.5623, + "step": 12843 + }, + { + "epoch": 4.74132414472065, + "grad_norm": 0.9548503756523132, + "learning_rate": 5.284701909195535e-07, + "loss": 0.6027, + "step": 12844 + }, + { + "epoch": 4.741693330051686, + "grad_norm": 0.939248263835907, + "learning_rate": 5.281711605777659e-07, + "loss": 0.5752, + "step": 12845 + }, + { + "epoch": 4.742062515382722, + "grad_norm": 0.950762927532196, + "learning_rate": 5.278722048690785e-07, + "loss": 0.5667, + "step": 12846 + }, + { + "epoch": 4.742431700713758, + "grad_norm": 0.9711207151412964, + "learning_rate": 5.275733238048053e-07, + "loss": 0.5631, + "step": 12847 + }, + { + "epoch": 4.742800886044795, + "grad_norm": 0.952895998954773, + "learning_rate": 5.27274517396259e-07, + "loss": 0.606, + "step": 12848 + }, + { + "epoch": 4.743170071375831, + "grad_norm": 0.9648547768592834, + "learning_rate": 5.2697578565475e-07, + "loss": 0.5478, + "step": 12849 + }, + { + "epoch": 4.743539256706867, + "grad_norm": 0.937259316444397, + "learning_rate": 5.266771285915844e-07, + "loss": 0.5676, + "step": 12850 + }, + { + "epoch": 4.743908442037903, + "grad_norm": 0.980124294757843, + "learning_rate": 5.263785462180673e-07, + "loss": 0.5694, + "step": 12851 + }, + { + "epoch": 4.744277627368939, + "grad_norm": 0.9737994074821472, + "learning_rate": 5.260800385454992e-07, + "loss": 0.5327, + "step": 12852 + }, + { + "epoch": 4.744646812699975, + "grad_norm": 0.9359907507896423, + "learning_rate": 5.257816055851781e-07, + "loss": 0.559, + "step": 12853 + }, + { + "epoch": 4.745015998031011, + "grad_norm": 0.983676016330719, + "learning_rate": 5.254832473484008e-07, + "loss": 0.583, + "step": 12854 + }, + { + "epoch": 4.745385183362048, + "grad_norm": 0.9530048966407776, + "learning_rate": 5.251849638464592e-07, + "loss": 0.5759, + "step": 12855 + }, + { + "epoch": 4.745754368693084, + "grad_norm": 0.9472476243972778, + "learning_rate": 5.248867550906439e-07, + "loss": 0.5879, + "step": 12856 + }, + { + "epoch": 4.74612355402412, + "grad_norm": 0.974635124206543, + "learning_rate": 5.245886210922408e-07, + "loss": 0.5811, + "step": 12857 + }, + { + "epoch": 4.746492739355157, + "grad_norm": 0.9272179007530212, + "learning_rate": 5.242905618625347e-07, + "loss": 0.5712, + "step": 12858 + }, + { + "epoch": 4.7468619246861925, + "grad_norm": 0.9332128167152405, + "learning_rate": 5.239925774128088e-07, + "loss": 0.5812, + "step": 12859 + }, + { + "epoch": 4.747231110017228, + "grad_norm": 0.9560152292251587, + "learning_rate": 5.236946677543386e-07, + "loss": 0.5642, + "step": 12860 + }, + { + "epoch": 4.747600295348265, + "grad_norm": 0.9680345058441162, + "learning_rate": 5.233968328984018e-07, + "loss": 0.5773, + "step": 12861 + }, + { + "epoch": 4.747969480679301, + "grad_norm": 0.9795017838478088, + "learning_rate": 5.230990728562716e-07, + "loss": 0.5794, + "step": 12862 + }, + { + "epoch": 4.748338666010337, + "grad_norm": 0.947626531124115, + "learning_rate": 5.228013876392177e-07, + "loss": 0.5562, + "step": 12863 + }, + { + "epoch": 4.748707851341374, + "grad_norm": 0.974602222442627, + "learning_rate": 5.225037772585071e-07, + "loss": 0.5513, + "step": 12864 + }, + { + "epoch": 4.74907703667241, + "grad_norm": 0.948188841342926, + "learning_rate": 5.222062417254037e-07, + "loss": 0.5698, + "step": 12865 + }, + { + "epoch": 4.749446222003446, + "grad_norm": 0.9947631359100342, + "learning_rate": 5.2190878105117e-07, + "loss": 0.5892, + "step": 12866 + }, + { + "epoch": 4.749815407334482, + "grad_norm": 0.9606334567070007, + "learning_rate": 5.216113952470661e-07, + "loss": 0.5738, + "step": 12867 + }, + { + "epoch": 4.750184592665518, + "grad_norm": 0.9608035087585449, + "learning_rate": 5.21314084324345e-07, + "loss": 0.5805, + "step": 12868 + }, + { + "epoch": 4.750553777996554, + "grad_norm": 0.9582058787345886, + "learning_rate": 5.210168482942618e-07, + "loss": 0.6024, + "step": 12869 + }, + { + "epoch": 4.75092296332759, + "grad_norm": 0.9818593859672546, + "learning_rate": 5.207196871680658e-07, + "loss": 0.6191, + "step": 12870 + }, + { + "epoch": 4.751292148658626, + "grad_norm": 0.9773924946784973, + "learning_rate": 5.204226009570059e-07, + "loss": 0.594, + "step": 12871 + }, + { + "epoch": 4.751661333989663, + "grad_norm": 0.9664655923843384, + "learning_rate": 5.201255896723256e-07, + "loss": 0.5147, + "step": 12872 + }, + { + "epoch": 4.752030519320699, + "grad_norm": 0.946702778339386, + "learning_rate": 5.198286533252661e-07, + "loss": 0.5792, + "step": 12873 + }, + { + "epoch": 4.752399704651735, + "grad_norm": 0.9757911562919617, + "learning_rate": 5.195317919270679e-07, + "loss": 0.5802, + "step": 12874 + }, + { + "epoch": 4.752768889982772, + "grad_norm": 0.9576503038406372, + "learning_rate": 5.192350054889664e-07, + "loss": 0.572, + "step": 12875 + }, + { + "epoch": 4.7531380753138075, + "grad_norm": 0.9780775308609009, + "learning_rate": 5.189382940221941e-07, + "loss": 0.594, + "step": 12876 + }, + { + "epoch": 4.753507260644843, + "grad_norm": 0.9330947399139404, + "learning_rate": 5.186416575379827e-07, + "loss": 0.5488, + "step": 12877 + }, + { + "epoch": 4.75387644597588, + "grad_norm": 0.9537791609764099, + "learning_rate": 5.183450960475587e-07, + "loss": 0.5524, + "step": 12878 + }, + { + "epoch": 4.754245631306916, + "grad_norm": 0.9647408723831177, + "learning_rate": 5.180486095621484e-07, + "loss": 0.5607, + "step": 12879 + }, + { + "epoch": 4.754614816637952, + "grad_norm": 1.0236185789108276, + "learning_rate": 5.177521980929717e-07, + "loss": 0.591, + "step": 12880 + }, + { + "epoch": 4.754984001968989, + "grad_norm": 0.9642900824546814, + "learning_rate": 5.174558616512485e-07, + "loss": 0.5743, + "step": 12881 + }, + { + "epoch": 4.755353187300025, + "grad_norm": 0.9482384324073792, + "learning_rate": 5.17159600248196e-07, + "loss": 0.5728, + "step": 12882 + }, + { + "epoch": 4.755722372631061, + "grad_norm": 0.9547712802886963, + "learning_rate": 5.168634138950271e-07, + "loss": 0.5375, + "step": 12883 + }, + { + "epoch": 4.756091557962097, + "grad_norm": 0.9417369365692139, + "learning_rate": 5.165673026029516e-07, + "loss": 0.5434, + "step": 12884 + }, + { + "epoch": 4.756460743293133, + "grad_norm": 0.9326373338699341, + "learning_rate": 5.162712663831774e-07, + "loss": 0.5532, + "step": 12885 + }, + { + "epoch": 4.756829928624169, + "grad_norm": 0.9768733382225037, + "learning_rate": 5.159753052469097e-07, + "loss": 0.541, + "step": 12886 + }, + { + "epoch": 4.757199113955205, + "grad_norm": 0.9871580004692078, + "learning_rate": 5.156794192053516e-07, + "loss": 0.5685, + "step": 12887 + }, + { + "epoch": 4.757568299286242, + "grad_norm": 0.9756630659103394, + "learning_rate": 5.153836082697001e-07, + "loss": 0.6069, + "step": 12888 + }, + { + "epoch": 4.757937484617278, + "grad_norm": 1.081632137298584, + "learning_rate": 5.150878724511533e-07, + "loss": 0.5775, + "step": 12889 + }, + { + "epoch": 4.758306669948314, + "grad_norm": 0.9905683398246765, + "learning_rate": 5.147922117609033e-07, + "loss": 0.6027, + "step": 12890 + }, + { + "epoch": 4.75867585527935, + "grad_norm": 0.9759812951087952, + "learning_rate": 5.144966262101425e-07, + "loss": 0.5792, + "step": 12891 + }, + { + "epoch": 4.759045040610387, + "grad_norm": 0.9712845683097839, + "learning_rate": 5.142011158100574e-07, + "loss": 0.599, + "step": 12892 + }, + { + "epoch": 4.7594142259414225, + "grad_norm": 1.0147334337234497, + "learning_rate": 5.139056805718329e-07, + "loss": 0.5549, + "step": 12893 + }, + { + "epoch": 4.759783411272458, + "grad_norm": 0.9662838578224182, + "learning_rate": 5.136103205066522e-07, + "loss": 0.5698, + "step": 12894 + }, + { + "epoch": 4.760152596603495, + "grad_norm": 0.9443753957748413, + "learning_rate": 5.133150356256939e-07, + "loss": 0.5647, + "step": 12895 + }, + { + "epoch": 4.760521781934531, + "grad_norm": 0.9934337735176086, + "learning_rate": 5.13019825940134e-07, + "loss": 0.579, + "step": 12896 + }, + { + "epoch": 4.760890967265567, + "grad_norm": 0.9639095664024353, + "learning_rate": 5.127246914611469e-07, + "loss": 0.5469, + "step": 12897 + }, + { + "epoch": 4.761260152596604, + "grad_norm": 1.0003960132598877, + "learning_rate": 5.124296321999028e-07, + "loss": 0.6194, + "step": 12898 + }, + { + "epoch": 4.76162933792764, + "grad_norm": 0.9626024961471558, + "learning_rate": 5.121346481675702e-07, + "loss": 0.5814, + "step": 12899 + }, + { + "epoch": 4.761998523258676, + "grad_norm": 0.9583026170730591, + "learning_rate": 5.118397393753138e-07, + "loss": 0.5925, + "step": 12900 + }, + { + "epoch": 4.7623677085897125, + "grad_norm": 0.9807982444763184, + "learning_rate": 5.115449058342953e-07, + "loss": 0.5695, + "step": 12901 + }, + { + "epoch": 4.762736893920748, + "grad_norm": 0.9471434354782104, + "learning_rate": 5.11250147555675e-07, + "loss": 0.5561, + "step": 12902 + }, + { + "epoch": 4.763106079251784, + "grad_norm": 0.9732187986373901, + "learning_rate": 5.109554645506093e-07, + "loss": 0.5656, + "step": 12903 + }, + { + "epoch": 4.76347526458282, + "grad_norm": 0.9477128982543945, + "learning_rate": 5.106608568302504e-07, + "loss": 0.5574, + "step": 12904 + }, + { + "epoch": 4.763844449913857, + "grad_norm": 0.936545729637146, + "learning_rate": 5.103663244057511e-07, + "loss": 0.5582, + "step": 12905 + }, + { + "epoch": 4.764213635244893, + "grad_norm": 0.9762539267539978, + "learning_rate": 5.100718672882579e-07, + "loss": 0.6055, + "step": 12906 + }, + { + "epoch": 4.764582820575929, + "grad_norm": 0.9656641483306885, + "learning_rate": 5.097774854889178e-07, + "loss": 0.5637, + "step": 12907 + }, + { + "epoch": 4.764952005906966, + "grad_norm": 0.9469454884529114, + "learning_rate": 5.094831790188703e-07, + "loss": 0.5414, + "step": 12908 + }, + { + "epoch": 4.765321191238002, + "grad_norm": 0.9874356389045715, + "learning_rate": 5.091889478892564e-07, + "loss": 0.5393, + "step": 12909 + }, + { + "epoch": 4.7656903765690375, + "grad_norm": 0.9328457117080688, + "learning_rate": 5.088947921112131e-07, + "loss": 0.5761, + "step": 12910 + }, + { + "epoch": 4.766059561900073, + "grad_norm": 0.949308454990387, + "learning_rate": 5.086007116958736e-07, + "loss": 0.5413, + "step": 12911 + }, + { + "epoch": 4.76642874723111, + "grad_norm": 0.9297245144844055, + "learning_rate": 5.083067066543685e-07, + "loss": 0.568, + "step": 12912 + }, + { + "epoch": 4.766797932562146, + "grad_norm": 0.9645878076553345, + "learning_rate": 5.080127769978255e-07, + "loss": 0.5716, + "step": 12913 + }, + { + "epoch": 4.767167117893182, + "grad_norm": 0.940948486328125, + "learning_rate": 5.0771892273737e-07, + "loss": 0.5306, + "step": 12914 + }, + { + "epoch": 4.767536303224219, + "grad_norm": 0.9467688798904419, + "learning_rate": 5.07425143884126e-07, + "loss": 0.5888, + "step": 12915 + }, + { + "epoch": 4.767905488555255, + "grad_norm": 0.975978434085846, + "learning_rate": 5.0713144044921e-07, + "loss": 0.5701, + "step": 12916 + }, + { + "epoch": 4.768274673886291, + "grad_norm": 0.9679338932037354, + "learning_rate": 5.068378124437407e-07, + "loss": 0.5548, + "step": 12917 + }, + { + "epoch": 4.7686438592173275, + "grad_norm": 0.9448720812797546, + "learning_rate": 5.065442598788306e-07, + "loss": 0.6097, + "step": 12918 + }, + { + "epoch": 4.769013044548363, + "grad_norm": 0.9845861792564392, + "learning_rate": 5.062507827655916e-07, + "loss": 0.5761, + "step": 12919 + }, + { + "epoch": 4.769382229879399, + "grad_norm": 0.9609671235084534, + "learning_rate": 5.059573811151314e-07, + "loss": 0.5469, + "step": 12920 + }, + { + "epoch": 4.769751415210436, + "grad_norm": 0.9485464692115784, + "learning_rate": 5.056640549385541e-07, + "loss": 0.5847, + "step": 12921 + }, + { + "epoch": 4.770120600541472, + "grad_norm": 0.9321712255477905, + "learning_rate": 5.053708042469638e-07, + "loss": 0.5908, + "step": 12922 + }, + { + "epoch": 4.770489785872508, + "grad_norm": 0.9852068424224854, + "learning_rate": 5.050776290514589e-07, + "loss": 0.5504, + "step": 12923 + }, + { + "epoch": 4.770858971203544, + "grad_norm": 0.9274271726608276, + "learning_rate": 5.047845293631356e-07, + "loss": 0.5504, + "step": 12924 + }, + { + "epoch": 4.771228156534581, + "grad_norm": 0.9616203904151917, + "learning_rate": 5.044915051930888e-07, + "loss": 0.5693, + "step": 12925 + }, + { + "epoch": 4.771597341865617, + "grad_norm": 0.9647383093833923, + "learning_rate": 5.041985565524079e-07, + "loss": 0.5628, + "step": 12926 + }, + { + "epoch": 4.7719665271966525, + "grad_norm": 0.9837262034416199, + "learning_rate": 5.039056834521833e-07, + "loss": 0.6039, + "step": 12927 + }, + { + "epoch": 4.772335712527688, + "grad_norm": 0.9715542793273926, + "learning_rate": 5.036128859034972e-07, + "loss": 0.5737, + "step": 12928 + }, + { + "epoch": 4.772704897858725, + "grad_norm": 0.9788306951522827, + "learning_rate": 5.033201639174331e-07, + "loss": 0.5857, + "step": 12929 + }, + { + "epoch": 4.773074083189761, + "grad_norm": 0.9644140005111694, + "learning_rate": 5.030275175050716e-07, + "loss": 0.6196, + "step": 12930 + }, + { + "epoch": 4.773443268520797, + "grad_norm": 0.9657714366912842, + "learning_rate": 5.027349466774884e-07, + "loss": 0.5841, + "step": 12931 + }, + { + "epoch": 4.773812453851834, + "grad_norm": 1.0031718015670776, + "learning_rate": 5.024424514457568e-07, + "loss": 0.5819, + "step": 12932 + }, + { + "epoch": 4.77418163918287, + "grad_norm": 0.9299526810646057, + "learning_rate": 5.021500318209474e-07, + "loss": 0.592, + "step": 12933 + }, + { + "epoch": 4.774550824513906, + "grad_norm": 0.9405717253684998, + "learning_rate": 5.018576878141287e-07, + "loss": 0.5648, + "step": 12934 + }, + { + "epoch": 4.7749200098449425, + "grad_norm": 0.9539112448692322, + "learning_rate": 5.015654194363673e-07, + "loss": 0.5824, + "step": 12935 + }, + { + "epoch": 4.775289195175978, + "grad_norm": 0.9924980998039246, + "learning_rate": 5.012732266987227e-07, + "loss": 0.581, + "step": 12936 + }, + { + "epoch": 4.775658380507014, + "grad_norm": 0.9615486860275269, + "learning_rate": 5.009811096122558e-07, + "loss": 0.5564, + "step": 12937 + }, + { + "epoch": 4.776027565838051, + "grad_norm": 0.9737609028816223, + "learning_rate": 5.006890681880236e-07, + "loss": 0.5406, + "step": 12938 + }, + { + "epoch": 4.776396751169087, + "grad_norm": 0.9409693479537964, + "learning_rate": 5.00397102437079e-07, + "loss": 0.5415, + "step": 12939 + }, + { + "epoch": 4.776765936500123, + "grad_norm": 0.9560253024101257, + "learning_rate": 5.001052123704733e-07, + "loss": 0.5882, + "step": 12940 + }, + { + "epoch": 4.77713512183116, + "grad_norm": 0.9520636200904846, + "learning_rate": 4.998133979992534e-07, + "loss": 0.5545, + "step": 12941 + }, + { + "epoch": 4.777504307162196, + "grad_norm": 0.9526748657226562, + "learning_rate": 4.995216593344656e-07, + "loss": 0.5624, + "step": 12942 + }, + { + "epoch": 4.777873492493232, + "grad_norm": 0.9568095207214355, + "learning_rate": 4.992299963871516e-07, + "loss": 0.575, + "step": 12943 + }, + { + "epoch": 4.7782426778242675, + "grad_norm": 0.9406058192253113, + "learning_rate": 4.989384091683502e-07, + "loss": 0.5555, + "step": 12944 + }, + { + "epoch": 4.778611863155304, + "grad_norm": 0.9632020592689514, + "learning_rate": 4.986468976890993e-07, + "loss": 0.5687, + "step": 12945 + }, + { + "epoch": 4.77898104848634, + "grad_norm": 0.9953078031539917, + "learning_rate": 4.983554619604308e-07, + "loss": 0.5914, + "step": 12946 + }, + { + "epoch": 4.779350233817376, + "grad_norm": 0.9798644781112671, + "learning_rate": 4.980641019933771e-07, + "loss": 0.5836, + "step": 12947 + }, + { + "epoch": 4.779719419148412, + "grad_norm": 0.9613191485404968, + "learning_rate": 4.977728177989655e-07, + "loss": 0.6096, + "step": 12948 + }, + { + "epoch": 4.780088604479449, + "grad_norm": 0.9675655961036682, + "learning_rate": 4.974816093882199e-07, + "loss": 0.5715, + "step": 12949 + }, + { + "epoch": 4.780457789810485, + "grad_norm": 0.9700829982757568, + "learning_rate": 4.971904767721642e-07, + "loss": 0.5526, + "step": 12950 + }, + { + "epoch": 4.780826975141521, + "grad_norm": 0.9470189213752747, + "learning_rate": 4.968994199618169e-07, + "loss": 0.5247, + "step": 12951 + }, + { + "epoch": 4.7811961604725575, + "grad_norm": 0.9294775128364563, + "learning_rate": 4.966084389681932e-07, + "loss": 0.5444, + "step": 12952 + }, + { + "epoch": 4.781565345803593, + "grad_norm": 0.9473828673362732, + "learning_rate": 4.963175338023091e-07, + "loss": 0.5919, + "step": 12953 + }, + { + "epoch": 4.781934531134629, + "grad_norm": 1.0572129487991333, + "learning_rate": 4.96026704475173e-07, + "loss": 0.5717, + "step": 12954 + }, + { + "epoch": 4.782303716465666, + "grad_norm": 0.9351916909217834, + "learning_rate": 4.957359509977952e-07, + "loss": 0.5583, + "step": 12955 + }, + { + "epoch": 4.782672901796702, + "grad_norm": 0.9589850902557373, + "learning_rate": 4.954452733811777e-07, + "loss": 0.565, + "step": 12956 + }, + { + "epoch": 4.783042087127738, + "grad_norm": 0.9688576459884644, + "learning_rate": 4.951546716363237e-07, + "loss": 0.5761, + "step": 12957 + }, + { + "epoch": 4.783411272458775, + "grad_norm": 0.9485651254653931, + "learning_rate": 4.948641457742334e-07, + "loss": 0.5494, + "step": 12958 + }, + { + "epoch": 4.783780457789811, + "grad_norm": 0.9422640800476074, + "learning_rate": 4.945736958059025e-07, + "loss": 0.5819, + "step": 12959 + }, + { + "epoch": 4.784149643120847, + "grad_norm": 0.9551543593406677, + "learning_rate": 4.942833217423242e-07, + "loss": 0.5903, + "step": 12960 + }, + { + "epoch": 4.7845188284518825, + "grad_norm": 0.953154444694519, + "learning_rate": 4.939930235944886e-07, + "loss": 0.5851, + "step": 12961 + }, + { + "epoch": 4.784888013782919, + "grad_norm": 0.9545913934707642, + "learning_rate": 4.937028013733846e-07, + "loss": 0.5851, + "step": 12962 + }, + { + "epoch": 4.785257199113955, + "grad_norm": 0.9532111287117004, + "learning_rate": 4.934126550899962e-07, + "loss": 0.6347, + "step": 12963 + }, + { + "epoch": 4.785626384444991, + "grad_norm": 1.0322078466415405, + "learning_rate": 4.931225847553051e-07, + "loss": 0.5839, + "step": 12964 + }, + { + "epoch": 4.785995569776028, + "grad_norm": 1.0029598474502563, + "learning_rate": 4.928325903802913e-07, + "loss": 0.6082, + "step": 12965 + }, + { + "epoch": 4.786364755107064, + "grad_norm": 0.9600620865821838, + "learning_rate": 4.925426719759299e-07, + "loss": 0.5605, + "step": 12966 + }, + { + "epoch": 4.7867339404381, + "grad_norm": 0.9522488713264465, + "learning_rate": 4.922528295531953e-07, + "loss": 0.5268, + "step": 12967 + }, + { + "epoch": 4.787103125769136, + "grad_norm": 0.9454140067100525, + "learning_rate": 4.919630631230576e-07, + "loss": 0.5691, + "step": 12968 + }, + { + "epoch": 4.7874723111001725, + "grad_norm": 0.9789222478866577, + "learning_rate": 4.916733726964837e-07, + "loss": 0.5402, + "step": 12969 + }, + { + "epoch": 4.787841496431208, + "grad_norm": 0.9149588346481323, + "learning_rate": 4.913837582844391e-07, + "loss": 0.5508, + "step": 12970 + }, + { + "epoch": 4.788210681762244, + "grad_norm": 0.9586299657821655, + "learning_rate": 4.910942198978855e-07, + "loss": 0.5696, + "step": 12971 + }, + { + "epoch": 4.788579867093281, + "grad_norm": 0.9575399160385132, + "learning_rate": 4.908047575477809e-07, + "loss": 0.5835, + "step": 12972 + }, + { + "epoch": 4.788949052424317, + "grad_norm": 0.9366751313209534, + "learning_rate": 4.90515371245083e-07, + "loss": 0.5806, + "step": 12973 + }, + { + "epoch": 4.789318237755353, + "grad_norm": 0.9151009917259216, + "learning_rate": 4.902260610007433e-07, + "loss": 0.5711, + "step": 12974 + }, + { + "epoch": 4.78968742308639, + "grad_norm": 0.9411727786064148, + "learning_rate": 4.899368268257135e-07, + "loss": 0.5565, + "step": 12975 + }, + { + "epoch": 4.790056608417426, + "grad_norm": 0.962827205657959, + "learning_rate": 4.896476687309407e-07, + "loss": 0.5892, + "step": 12976 + }, + { + "epoch": 4.790425793748462, + "grad_norm": 0.9628832340240479, + "learning_rate": 4.893585867273681e-07, + "loss": 0.5779, + "step": 12977 + }, + { + "epoch": 4.790794979079498, + "grad_norm": 0.9602428078651428, + "learning_rate": 4.890695808259394e-07, + "loss": 0.6003, + "step": 12978 + }, + { + "epoch": 4.791164164410534, + "grad_norm": 0.9287784099578857, + "learning_rate": 4.887806510375923e-07, + "loss": 0.5385, + "step": 12979 + }, + { + "epoch": 4.79153334974157, + "grad_norm": 0.9264265894889832, + "learning_rate": 4.88491797373262e-07, + "loss": 0.5606, + "step": 12980 + }, + { + "epoch": 4.791902535072606, + "grad_norm": 0.9782569408416748, + "learning_rate": 4.88203019843883e-07, + "loss": 0.5537, + "step": 12981 + }, + { + "epoch": 4.792271720403643, + "grad_norm": 0.9465296864509583, + "learning_rate": 4.879143184603849e-07, + "loss": 0.5916, + "step": 12982 + }, + { + "epoch": 4.792640905734679, + "grad_norm": 0.9871791005134583, + "learning_rate": 4.876256932336948e-07, + "loss": 0.6151, + "step": 12983 + }, + { + "epoch": 4.793010091065715, + "grad_norm": 0.9497408866882324, + "learning_rate": 4.873371441747366e-07, + "loss": 0.5618, + "step": 12984 + }, + { + "epoch": 4.7933792763967515, + "grad_norm": 0.9980105757713318, + "learning_rate": 4.870486712944319e-07, + "loss": 0.5925, + "step": 12985 + }, + { + "epoch": 4.7937484617277875, + "grad_norm": 0.9791030883789062, + "learning_rate": 4.867602746037006e-07, + "loss": 0.5632, + "step": 12986 + }, + { + "epoch": 4.794117647058823, + "grad_norm": 0.9836077094078064, + "learning_rate": 4.864719541134577e-07, + "loss": 0.5469, + "step": 12987 + }, + { + "epoch": 4.794486832389859, + "grad_norm": 0.9858665466308594, + "learning_rate": 4.861837098346156e-07, + "loss": 0.5781, + "step": 12988 + }, + { + "epoch": 4.794856017720896, + "grad_norm": 0.9910055994987488, + "learning_rate": 4.858955417780839e-07, + "loss": 0.6149, + "step": 12989 + }, + { + "epoch": 4.795225203051932, + "grad_norm": 1.015304446220398, + "learning_rate": 4.856074499547709e-07, + "loss": 0.5915, + "step": 12990 + }, + { + "epoch": 4.795594388382968, + "grad_norm": 0.969216525554657, + "learning_rate": 4.853194343755801e-07, + "loss": 0.5798, + "step": 12991 + }, + { + "epoch": 4.795963573714005, + "grad_norm": 0.9755946397781372, + "learning_rate": 4.850314950514124e-07, + "loss": 0.5725, + "step": 12992 + }, + { + "epoch": 4.796332759045041, + "grad_norm": 0.9705687761306763, + "learning_rate": 4.847436319931673e-07, + "loss": 0.5916, + "step": 12993 + }, + { + "epoch": 4.7967019443760766, + "grad_norm": 0.9638211727142334, + "learning_rate": 4.84455845211739e-07, + "loss": 0.556, + "step": 12994 + }, + { + "epoch": 4.797071129707113, + "grad_norm": 0.9814696907997131, + "learning_rate": 4.841681347180216e-07, + "loss": 0.5989, + "step": 12995 + }, + { + "epoch": 4.797440315038149, + "grad_norm": 0.9551864266395569, + "learning_rate": 4.838805005229039e-07, + "loss": 0.5523, + "step": 12996 + }, + { + "epoch": 4.797809500369185, + "grad_norm": 0.975407600402832, + "learning_rate": 4.835929426372724e-07, + "loss": 0.5999, + "step": 12997 + }, + { + "epoch": 4.798178685700222, + "grad_norm": 0.9465740323066711, + "learning_rate": 4.833054610720123e-07, + "loss": 0.5726, + "step": 12998 + }, + { + "epoch": 4.798547871031258, + "grad_norm": 0.9572254419326782, + "learning_rate": 4.83018055838004e-07, + "loss": 0.5751, + "step": 12999 + }, + { + "epoch": 4.798917056362294, + "grad_norm": 0.9777554869651794, + "learning_rate": 4.827307269461251e-07, + "loss": 0.5734, + "step": 13000 + }, + { + "epoch": 4.79928624169333, + "grad_norm": 0.9848794937133789, + "learning_rate": 4.82443474407252e-07, + "loss": 0.5494, + "step": 13001 + }, + { + "epoch": 4.7996554270243665, + "grad_norm": 0.9487075805664062, + "learning_rate": 4.82156298232257e-07, + "loss": 0.5667, + "step": 13002 + }, + { + "epoch": 4.8000246123554025, + "grad_norm": 0.923477053642273, + "learning_rate": 4.81869198432009e-07, + "loss": 0.6084, + "step": 13003 + }, + { + "epoch": 4.800393797686438, + "grad_norm": 0.9727926254272461, + "learning_rate": 4.815821750173743e-07, + "loss": 0.5743, + "step": 13004 + }, + { + "epoch": 4.800762983017474, + "grad_norm": 0.9406960010528564, + "learning_rate": 4.812952279992172e-07, + "loss": 0.5766, + "step": 13005 + }, + { + "epoch": 4.801132168348511, + "grad_norm": 0.9847185611724854, + "learning_rate": 4.810083573883995e-07, + "loss": 0.5879, + "step": 13006 + }, + { + "epoch": 4.801501353679547, + "grad_norm": 0.9554351568222046, + "learning_rate": 4.807215631957782e-07, + "loss": 0.5838, + "step": 13007 + }, + { + "epoch": 4.801870539010583, + "grad_norm": 0.9641968607902527, + "learning_rate": 4.804348454322074e-07, + "loss": 0.5655, + "step": 13008 + }, + { + "epoch": 4.80223972434162, + "grad_norm": 0.9540976285934448, + "learning_rate": 4.801482041085415e-07, + "loss": 0.5884, + "step": 13009 + }, + { + "epoch": 4.802608909672656, + "grad_norm": 0.9928523302078247, + "learning_rate": 4.798616392356281e-07, + "loss": 0.59, + "step": 13010 + }, + { + "epoch": 4.8029780950036915, + "grad_norm": 0.9778512716293335, + "learning_rate": 4.795751508243146e-07, + "loss": 0.5941, + "step": 13011 + }, + { + "epoch": 4.803347280334728, + "grad_norm": 0.9656241536140442, + "learning_rate": 4.792887388854431e-07, + "loss": 0.5948, + "step": 13012 + }, + { + "epoch": 4.803716465665764, + "grad_norm": 0.965020477771759, + "learning_rate": 4.790024034298548e-07, + "loss": 0.5808, + "step": 13013 + }, + { + "epoch": 4.8040856509968, + "grad_norm": 0.9553949236869812, + "learning_rate": 4.787161444683889e-07, + "loss": 0.6022, + "step": 13014 + }, + { + "epoch": 4.804454836327837, + "grad_norm": 0.951023280620575, + "learning_rate": 4.784299620118785e-07, + "loss": 0.5875, + "step": 13015 + }, + { + "epoch": 4.804824021658873, + "grad_norm": 0.9709046483039856, + "learning_rate": 4.781438560711563e-07, + "loss": 0.556, + "step": 13016 + }, + { + "epoch": 4.805193206989909, + "grad_norm": 0.9450684189796448, + "learning_rate": 4.778578266570503e-07, + "loss": 0.5461, + "step": 13017 + }, + { + "epoch": 4.805562392320945, + "grad_norm": 0.9225620031356812, + "learning_rate": 4.775718737803881e-07, + "loss": 0.5525, + "step": 13018 + }, + { + "epoch": 4.8059315776519815, + "grad_norm": 0.9523782730102539, + "learning_rate": 4.772859974519922e-07, + "loss": 0.6072, + "step": 13019 + }, + { + "epoch": 4.8063007629830174, + "grad_norm": 0.9876287579536438, + "learning_rate": 4.770001976826821e-07, + "loss": 0.601, + "step": 13020 + }, + { + "epoch": 4.806669948314053, + "grad_norm": 0.9437212944030762, + "learning_rate": 4.767144744832769e-07, + "loss": 0.5538, + "step": 13021 + }, + { + "epoch": 4.80703913364509, + "grad_norm": 0.9789315462112427, + "learning_rate": 4.764288278645901e-07, + "loss": 0.5922, + "step": 13022 + }, + { + "epoch": 4.807408318976126, + "grad_norm": 0.941317617893219, + "learning_rate": 4.761432578374331e-07, + "loss": 0.5852, + "step": 13023 + }, + { + "epoch": 4.807777504307162, + "grad_norm": 0.9587847590446472, + "learning_rate": 4.7585776441261555e-07, + "loss": 0.6006, + "step": 13024 + }, + { + "epoch": 4.808146689638198, + "grad_norm": 0.9923948049545288, + "learning_rate": 4.755723476009419e-07, + "loss": 0.6092, + "step": 13025 + }, + { + "epoch": 4.808515874969235, + "grad_norm": 0.9370457530021667, + "learning_rate": 4.752870074132171e-07, + "loss": 0.5521, + "step": 13026 + }, + { + "epoch": 4.808885060300271, + "grad_norm": 0.9486737847328186, + "learning_rate": 4.7500174386023975e-07, + "loss": 0.5839, + "step": 13027 + }, + { + "epoch": 4.8092542456313065, + "grad_norm": 0.9999043345451355, + "learning_rate": 4.747165569528067e-07, + "loss": 0.589, + "step": 13028 + }, + { + "epoch": 4.809623430962343, + "grad_norm": 0.9791700839996338, + "learning_rate": 4.744314467017133e-07, + "loss": 0.5681, + "step": 13029 + }, + { + "epoch": 4.809992616293379, + "grad_norm": 0.9323267936706543, + "learning_rate": 4.741464131177506e-07, + "loss": 0.5746, + "step": 13030 + }, + { + "epoch": 4.810361801624415, + "grad_norm": 0.9661452770233154, + "learning_rate": 4.7386145621170664e-07, + "loss": 0.5867, + "step": 13031 + }, + { + "epoch": 4.810730986955452, + "grad_norm": 0.9621000289916992, + "learning_rate": 4.735765759943667e-07, + "loss": 0.5826, + "step": 13032 + }, + { + "epoch": 4.811100172286488, + "grad_norm": 0.9545723795890808, + "learning_rate": 4.7329177247651347e-07, + "loss": 0.5708, + "step": 13033 + }, + { + "epoch": 4.811469357617524, + "grad_norm": 0.9334413409233093, + "learning_rate": 4.7300704566892786e-07, + "loss": 0.5484, + "step": 13034 + }, + { + "epoch": 4.811838542948561, + "grad_norm": 0.9710033535957336, + "learning_rate": 4.7272239558238594e-07, + "loss": 0.5762, + "step": 13035 + }, + { + "epoch": 4.8122077282795965, + "grad_norm": 0.9515325427055359, + "learning_rate": 4.724378222276613e-07, + "loss": 0.5901, + "step": 13036 + }, + { + "epoch": 4.8125769136106324, + "grad_norm": 1.003464937210083, + "learning_rate": 4.7215332561552475e-07, + "loss": 0.588, + "step": 13037 + }, + { + "epoch": 4.812946098941668, + "grad_norm": 0.9768511056900024, + "learning_rate": 4.718689057567452e-07, + "loss": 0.5564, + "step": 13038 + }, + { + "epoch": 4.813315284272705, + "grad_norm": 0.9579365253448486, + "learning_rate": 4.7158456266208767e-07, + "loss": 0.5541, + "step": 13039 + }, + { + "epoch": 4.813684469603741, + "grad_norm": 0.9692876935005188, + "learning_rate": 4.7130029634231355e-07, + "loss": 0.5935, + "step": 13040 + }, + { + "epoch": 4.814053654934777, + "grad_norm": 0.9817963242530823, + "learning_rate": 4.710161068081837e-07, + "loss": 0.5835, + "step": 13041 + }, + { + "epoch": 4.814422840265814, + "grad_norm": 0.9948225617408752, + "learning_rate": 4.7073199407045383e-07, + "loss": 0.5892, + "step": 13042 + }, + { + "epoch": 4.81479202559685, + "grad_norm": 0.9660698771476746, + "learning_rate": 4.7044795813987657e-07, + "loss": 0.6159, + "step": 13043 + }, + { + "epoch": 4.815161210927886, + "grad_norm": 0.9873417615890503, + "learning_rate": 4.701639990272047e-07, + "loss": 0.5616, + "step": 13044 + }, + { + "epoch": 4.8155303962589215, + "grad_norm": 0.9862447381019592, + "learning_rate": 4.6988011674318376e-07, + "loss": 0.6059, + "step": 13045 + }, + { + "epoch": 4.815899581589958, + "grad_norm": 0.9527393579483032, + "learning_rate": 4.695963112985608e-07, + "loss": 0.5781, + "step": 13046 + }, + { + "epoch": 4.816268766920994, + "grad_norm": 0.9797887802124023, + "learning_rate": 4.693125827040762e-07, + "loss": 0.5583, + "step": 13047 + }, + { + "epoch": 4.81663795225203, + "grad_norm": 0.9210270047187805, + "learning_rate": 4.6902893097046904e-07, + "loss": 0.5765, + "step": 13048 + }, + { + "epoch": 4.817007137583067, + "grad_norm": 0.9492358565330505, + "learning_rate": 4.687453561084765e-07, + "loss": 0.5763, + "step": 13049 + }, + { + "epoch": 4.817376322914103, + "grad_norm": 0.9646016955375671, + "learning_rate": 4.684618581288311e-07, + "loss": 0.6047, + "step": 13050 + }, + { + "epoch": 4.817745508245139, + "grad_norm": 0.9938595294952393, + "learning_rate": 4.681784370422626e-07, + "loss": 0.5705, + "step": 13051 + }, + { + "epoch": 4.818114693576176, + "grad_norm": 0.9678547978401184, + "learning_rate": 4.6789509285949967e-07, + "loss": 0.5423, + "step": 13052 + }, + { + "epoch": 4.8184838789072115, + "grad_norm": 0.931814432144165, + "learning_rate": 4.676118255912654e-07, + "loss": 0.5613, + "step": 13053 + }, + { + "epoch": 4.818853064238247, + "grad_norm": 0.9598156809806824, + "learning_rate": 4.673286352482828e-07, + "loss": 0.5408, + "step": 13054 + }, + { + "epoch": 4.819222249569284, + "grad_norm": 0.9528682827949524, + "learning_rate": 4.670455218412698e-07, + "loss": 0.5546, + "step": 13055 + }, + { + "epoch": 4.81959143490032, + "grad_norm": 0.9515007734298706, + "learning_rate": 4.667624853809416e-07, + "loss": 0.6031, + "step": 13056 + }, + { + "epoch": 4.819960620231356, + "grad_norm": 0.9634791016578674, + "learning_rate": 4.6647952587801223e-07, + "loss": 0.5488, + "step": 13057 + }, + { + "epoch": 4.820329805562392, + "grad_norm": 0.9287925958633423, + "learning_rate": 4.661966433431911e-07, + "loss": 0.5818, + "step": 13058 + }, + { + "epoch": 4.820698990893429, + "grad_norm": 0.9584846496582031, + "learning_rate": 4.6591383778718495e-07, + "loss": 0.5516, + "step": 13059 + }, + { + "epoch": 4.821068176224465, + "grad_norm": 0.9359522461891174, + "learning_rate": 4.6563110922069747e-07, + "loss": 0.5902, + "step": 13060 + }, + { + "epoch": 4.821437361555501, + "grad_norm": 0.915651798248291, + "learning_rate": 4.6534845765443066e-07, + "loss": 0.5366, + "step": 13061 + }, + { + "epoch": 4.821806546886537, + "grad_norm": 0.9655027985572815, + "learning_rate": 4.650658830990831e-07, + "loss": 0.5975, + "step": 13062 + }, + { + "epoch": 4.822175732217573, + "grad_norm": 0.9609534740447998, + "learning_rate": 4.647833855653494e-07, + "loss": 0.5711, + "step": 13063 + }, + { + "epoch": 4.822544917548609, + "grad_norm": 0.9107581377029419, + "learning_rate": 4.6450096506392263e-07, + "loss": 0.5275, + "step": 13064 + }, + { + "epoch": 4.822914102879645, + "grad_norm": 0.9243404865264893, + "learning_rate": 4.6421862160549116e-07, + "loss": 0.5726, + "step": 13065 + }, + { + "epoch": 4.823283288210682, + "grad_norm": 0.9646216034889221, + "learning_rate": 4.639363552007428e-07, + "loss": 0.6193, + "step": 13066 + }, + { + "epoch": 4.823652473541718, + "grad_norm": 0.9483696222305298, + "learning_rate": 4.6365416586036103e-07, + "loss": 0.5931, + "step": 13067 + }, + { + "epoch": 4.824021658872754, + "grad_norm": 0.9721295237541199, + "learning_rate": 4.6337205359502565e-07, + "loss": 0.574, + "step": 13068 + }, + { + "epoch": 4.824390844203791, + "grad_norm": 0.9653038382530212, + "learning_rate": 4.630900184154158e-07, + "loss": 0.5496, + "step": 13069 + }, + { + "epoch": 4.8247600295348265, + "grad_norm": 0.9679992198944092, + "learning_rate": 4.6280806033220606e-07, + "loss": 0.57, + "step": 13070 + }, + { + "epoch": 4.825129214865862, + "grad_norm": 0.9687848091125488, + "learning_rate": 4.6252617935606757e-07, + "loss": 0.5431, + "step": 13071 + }, + { + "epoch": 4.825498400196899, + "grad_norm": 0.9451284408569336, + "learning_rate": 4.6224437549767106e-07, + "loss": 0.5567, + "step": 13072 + }, + { + "epoch": 4.825867585527935, + "grad_norm": 0.9511160850524902, + "learning_rate": 4.6196264876768095e-07, + "loss": 0.5528, + "step": 13073 + }, + { + "epoch": 4.826236770858971, + "grad_norm": 0.9579457640647888, + "learning_rate": 4.616809991767621e-07, + "loss": 0.5727, + "step": 13074 + }, + { + "epoch": 4.826605956190008, + "grad_norm": 1.0209715366363525, + "learning_rate": 4.6139942673557397e-07, + "loss": 0.5665, + "step": 13075 + }, + { + "epoch": 4.826975141521044, + "grad_norm": 0.9699881672859192, + "learning_rate": 4.611179314547734e-07, + "loss": 0.5938, + "step": 13076 + }, + { + "epoch": 4.82734432685208, + "grad_norm": 0.9955794215202332, + "learning_rate": 4.608365133450166e-07, + "loss": 0.5788, + "step": 13077 + }, + { + "epoch": 4.827713512183116, + "grad_norm": 0.9673070907592773, + "learning_rate": 4.605551724169541e-07, + "loss": 0.5701, + "step": 13078 + }, + { + "epoch": 4.828082697514152, + "grad_norm": 0.9819490313529968, + "learning_rate": 4.602739086812347e-07, + "loss": 0.5521, + "step": 13079 + }, + { + "epoch": 4.828451882845188, + "grad_norm": 0.9433984756469727, + "learning_rate": 4.599927221485034e-07, + "loss": 0.5943, + "step": 13080 + }, + { + "epoch": 4.828821068176224, + "grad_norm": 0.9759746193885803, + "learning_rate": 4.597116128294035e-07, + "loss": 0.568, + "step": 13081 + }, + { + "epoch": 4.82919025350726, + "grad_norm": 0.9819923639297485, + "learning_rate": 4.5943058073457616e-07, + "loss": 0.5752, + "step": 13082 + }, + { + "epoch": 4.829559438838297, + "grad_norm": 0.9766935706138611, + "learning_rate": 4.5914962587465723e-07, + "loss": 0.5604, + "step": 13083 + }, + { + "epoch": 4.829928624169333, + "grad_norm": 0.9890244007110596, + "learning_rate": 4.5886874826027993e-07, + "loss": 0.6087, + "step": 13084 + }, + { + "epoch": 4.830297809500369, + "grad_norm": 0.947257399559021, + "learning_rate": 4.5858794790207743e-07, + "loss": 0.6165, + "step": 13085 + }, + { + "epoch": 4.830666994831406, + "grad_norm": 0.959202766418457, + "learning_rate": 4.583072248106765e-07, + "loss": 0.5866, + "step": 13086 + }, + { + "epoch": 4.8310361801624415, + "grad_norm": 0.9719499945640564, + "learning_rate": 4.5802657899670283e-07, + "loss": 0.5532, + "step": 13087 + }, + { + "epoch": 4.831405365493477, + "grad_norm": 0.9840555787086487, + "learning_rate": 4.577460104707779e-07, + "loss": 0.5498, + "step": 13088 + }, + { + "epoch": 4.831774550824514, + "grad_norm": 0.9442031383514404, + "learning_rate": 4.5746551924352206e-07, + "loss": 0.5446, + "step": 13089 + }, + { + "epoch": 4.83214373615555, + "grad_norm": 0.9386102557182312, + "learning_rate": 4.57185105325553e-07, + "loss": 0.5439, + "step": 13090 + }, + { + "epoch": 4.832512921486586, + "grad_norm": 0.9735742211341858, + "learning_rate": 4.569047687274816e-07, + "loss": 0.5972, + "step": 13091 + }, + { + "epoch": 4.832882106817623, + "grad_norm": 0.9839740991592407, + "learning_rate": 4.5662450945992086e-07, + "loss": 0.5821, + "step": 13092 + }, + { + "epoch": 4.833251292148659, + "grad_norm": 0.975988507270813, + "learning_rate": 4.563443275334764e-07, + "loss": 0.6044, + "step": 13093 + }, + { + "epoch": 4.833620477479695, + "grad_norm": 0.9864599108695984, + "learning_rate": 4.560642229587553e-07, + "loss": 0.5953, + "step": 13094 + }, + { + "epoch": 4.833989662810731, + "grad_norm": 0.9673764109611511, + "learning_rate": 4.5578419574635794e-07, + "loss": 0.5499, + "step": 13095 + }, + { + "epoch": 4.834358848141767, + "grad_norm": 0.9564242959022522, + "learning_rate": 4.5550424590688315e-07, + "loss": 0.5587, + "step": 13096 + }, + { + "epoch": 4.834728033472803, + "grad_norm": 0.9160134792327881, + "learning_rate": 4.552243734509279e-07, + "loss": 0.5393, + "step": 13097 + }, + { + "epoch": 4.835097218803839, + "grad_norm": 0.9507557153701782, + "learning_rate": 4.5494457838908464e-07, + "loss": 0.5912, + "step": 13098 + }, + { + "epoch": 4.835466404134876, + "grad_norm": 0.9205230474472046, + "learning_rate": 4.5466486073194316e-07, + "loss": 0.5422, + "step": 13099 + }, + { + "epoch": 4.835835589465912, + "grad_norm": 0.9548346400260925, + "learning_rate": 4.54385220490092e-07, + "loss": 0.5646, + "step": 13100 + }, + { + "epoch": 4.836204774796948, + "grad_norm": 0.9546959400177002, + "learning_rate": 4.5410565767411377e-07, + "loss": 0.5708, + "step": 13101 + }, + { + "epoch": 4.836573960127984, + "grad_norm": 0.9634102582931519, + "learning_rate": 4.5382617229459135e-07, + "loss": 0.607, + "step": 13102 + }, + { + "epoch": 4.836943145459021, + "grad_norm": 0.973002016544342, + "learning_rate": 4.5354676436210277e-07, + "loss": 0.5698, + "step": 13103 + }, + { + "epoch": 4.8373123307900565, + "grad_norm": 0.9584357738494873, + "learning_rate": 4.5326743388722253e-07, + "loss": 0.5498, + "step": 13104 + }, + { + "epoch": 4.837681516121092, + "grad_norm": 0.9592992067337036, + "learning_rate": 4.5298818088052447e-07, + "loss": 0.5724, + "step": 13105 + }, + { + "epoch": 4.838050701452129, + "grad_norm": 0.9804208874702454, + "learning_rate": 4.527090053525779e-07, + "loss": 0.568, + "step": 13106 + }, + { + "epoch": 4.838419886783165, + "grad_norm": 0.9417576193809509, + "learning_rate": 4.52429907313949e-07, + "loss": 0.605, + "step": 13107 + }, + { + "epoch": 4.838789072114201, + "grad_norm": 0.9647360444068909, + "learning_rate": 4.521508867752017e-07, + "loss": 0.5571, + "step": 13108 + }, + { + "epoch": 4.839158257445238, + "grad_norm": 0.9518178701400757, + "learning_rate": 4.518719437468966e-07, + "loss": 0.5749, + "step": 13109 + }, + { + "epoch": 4.839527442776274, + "grad_norm": 0.9731206297874451, + "learning_rate": 4.5159307823959357e-07, + "loss": 0.5726, + "step": 13110 + }, + { + "epoch": 4.83989662810731, + "grad_norm": 0.9686379432678223, + "learning_rate": 4.513142902638448e-07, + "loss": 0.5586, + "step": 13111 + }, + { + "epoch": 4.8402658134383465, + "grad_norm": 0.9726652503013611, + "learning_rate": 4.510355798302041e-07, + "loss": 0.5573, + "step": 13112 + }, + { + "epoch": 4.840634998769382, + "grad_norm": 0.9493438601493835, + "learning_rate": 4.507569469492193e-07, + "loss": 0.5467, + "step": 13113 + }, + { + "epoch": 4.841004184100418, + "grad_norm": 0.9672284722328186, + "learning_rate": 4.5047839163143816e-07, + "loss": 0.5718, + "step": 13114 + }, + { + "epoch": 4.841373369431454, + "grad_norm": 0.9343790411949158, + "learning_rate": 4.50199913887403e-07, + "loss": 0.5856, + "step": 13115 + }, + { + "epoch": 4.841742554762491, + "grad_norm": 0.9758760333061218, + "learning_rate": 4.499215137276536e-07, + "loss": 0.5776, + "step": 13116 + }, + { + "epoch": 4.842111740093527, + "grad_norm": 0.9665729403495789, + "learning_rate": 4.496431911627283e-07, + "loss": 0.5688, + "step": 13117 + }, + { + "epoch": 4.842480925424563, + "grad_norm": 0.9662891626358032, + "learning_rate": 4.4936494620316127e-07, + "loss": 0.5533, + "step": 13118 + }, + { + "epoch": 4.8428501107556, + "grad_norm": 0.9902126789093018, + "learning_rate": 4.4908677885948324e-07, + "loss": 0.5774, + "step": 13119 + }, + { + "epoch": 4.843219296086636, + "grad_norm": 0.9906224012374878, + "learning_rate": 4.488086891422239e-07, + "loss": 0.5966, + "step": 13120 + }, + { + "epoch": 4.8435884814176715, + "grad_norm": 0.9264991283416748, + "learning_rate": 4.485306770619077e-07, + "loss": 0.5815, + "step": 13121 + }, + { + "epoch": 4.843957666748707, + "grad_norm": 1.006314992904663, + "learning_rate": 4.482527426290584e-07, + "loss": 0.5673, + "step": 13122 + }, + { + "epoch": 4.844326852079744, + "grad_norm": 0.9507179856300354, + "learning_rate": 4.4797488585419545e-07, + "loss": 0.5758, + "step": 13123 + }, + { + "epoch": 4.84469603741078, + "grad_norm": 0.9513377547264099, + "learning_rate": 4.4769710674783475e-07, + "loss": 0.5457, + "step": 13124 + }, + { + "epoch": 4.845065222741816, + "grad_norm": 0.950057327747345, + "learning_rate": 4.474194053204914e-07, + "loss": 0.5868, + "step": 13125 + }, + { + "epoch": 4.845434408072853, + "grad_norm": 0.9762392640113831, + "learning_rate": 4.471417815826759e-07, + "loss": 0.5499, + "step": 13126 + }, + { + "epoch": 4.845803593403889, + "grad_norm": 1.025907039642334, + "learning_rate": 4.468642355448954e-07, + "loss": 0.5573, + "step": 13127 + }, + { + "epoch": 4.846172778734925, + "grad_norm": 0.9876940846443176, + "learning_rate": 4.465867672176563e-07, + "loss": 0.5711, + "step": 13128 + }, + { + "epoch": 4.8465419640659615, + "grad_norm": 0.9400859475135803, + "learning_rate": 4.463093766114593e-07, + "loss": 0.5223, + "step": 13129 + }, + { + "epoch": 4.846911149396997, + "grad_norm": 1.8015745878219604, + "learning_rate": 4.460320637368054e-07, + "loss": 0.5765, + "step": 13130 + }, + { + "epoch": 4.847280334728033, + "grad_norm": 0.9749242067337036, + "learning_rate": 4.457548286041885e-07, + "loss": 0.5649, + "step": 13131 + }, + { + "epoch": 4.84764952005907, + "grad_norm": 0.9700090885162354, + "learning_rate": 4.4547767122410296e-07, + "loss": 0.5952, + "step": 13132 + }, + { + "epoch": 4.848018705390106, + "grad_norm": 0.967499315738678, + "learning_rate": 4.4520059160703984e-07, + "loss": 0.572, + "step": 13133 + }, + { + "epoch": 4.848387890721142, + "grad_norm": 0.9629972577095032, + "learning_rate": 4.4492358976348573e-07, + "loss": 0.5815, + "step": 13134 + }, + { + "epoch": 4.848757076052178, + "grad_norm": 0.9519364237785339, + "learning_rate": 4.446466657039253e-07, + "loss": 0.5798, + "step": 13135 + }, + { + "epoch": 4.849126261383215, + "grad_norm": 0.9817869067192078, + "learning_rate": 4.4436981943883907e-07, + "loss": 0.5448, + "step": 13136 + }, + { + "epoch": 4.849495446714251, + "grad_norm": 0.9265947937965393, + "learning_rate": 4.4409305097870644e-07, + "loss": 0.5753, + "step": 13137 + }, + { + "epoch": 4.8498646320452865, + "grad_norm": 0.950919508934021, + "learning_rate": 4.4381636033400425e-07, + "loss": 0.585, + "step": 13138 + }, + { + "epoch": 4.850233817376322, + "grad_norm": 0.9872589111328125, + "learning_rate": 4.4353974751520286e-07, + "loss": 0.5432, + "step": 13139 + }, + { + "epoch": 4.850603002707359, + "grad_norm": 0.9561243057250977, + "learning_rate": 4.432632125327732e-07, + "loss": 0.5694, + "step": 13140 + }, + { + "epoch": 4.850972188038395, + "grad_norm": 0.9758407473564148, + "learning_rate": 4.4298675539718143e-07, + "loss": 0.587, + "step": 13141 + }, + { + "epoch": 4.851341373369431, + "grad_norm": 0.9557921290397644, + "learning_rate": 4.427103761188925e-07, + "loss": 0.5751, + "step": 13142 + }, + { + "epoch": 4.851710558700468, + "grad_norm": 0.9832878112792969, + "learning_rate": 4.424340747083664e-07, + "loss": 0.5927, + "step": 13143 + }, + { + "epoch": 4.852079744031504, + "grad_norm": 0.9684576988220215, + "learning_rate": 4.4215785117606074e-07, + "loss": 0.5667, + "step": 13144 + }, + { + "epoch": 4.85244892936254, + "grad_norm": 0.9129310250282288, + "learning_rate": 4.418817055324315e-07, + "loss": 0.5801, + "step": 13145 + }, + { + "epoch": 4.8528181146935765, + "grad_norm": 0.9562647342681885, + "learning_rate": 4.416056377879299e-07, + "loss": 0.5931, + "step": 13146 + }, + { + "epoch": 4.853187300024612, + "grad_norm": 0.9952211380004883, + "learning_rate": 4.413296479530049e-07, + "loss": 0.5907, + "step": 13147 + }, + { + "epoch": 4.853556485355648, + "grad_norm": 0.9570009112358093, + "learning_rate": 4.410537360381034e-07, + "loss": 0.5732, + "step": 13148 + }, + { + "epoch": 4.853925670686685, + "grad_norm": 0.9713780283927917, + "learning_rate": 4.4077790205366787e-07, + "loss": 0.5816, + "step": 13149 + }, + { + "epoch": 4.854294856017721, + "grad_norm": 0.9787811636924744, + "learning_rate": 4.4050214601013995e-07, + "loss": 0.5475, + "step": 13150 + }, + { + "epoch": 4.854664041348757, + "grad_norm": 0.9399644136428833, + "learning_rate": 4.4022646791795466e-07, + "loss": 0.5335, + "step": 13151 + }, + { + "epoch": 4.855033226679793, + "grad_norm": 0.9344872236251831, + "learning_rate": 4.399508677875472e-07, + "loss": 0.5854, + "step": 13152 + }, + { + "epoch": 4.85540241201083, + "grad_norm": 0.959585964679718, + "learning_rate": 4.396753456293501e-07, + "loss": 0.5634, + "step": 13153 + }, + { + "epoch": 4.855771597341866, + "grad_norm": 0.9953871965408325, + "learning_rate": 4.3939990145379085e-07, + "loss": 0.6044, + "step": 13154 + }, + { + "epoch": 4.8561407826729015, + "grad_norm": 0.9555123448371887, + "learning_rate": 4.39124535271295e-07, + "loss": 0.5847, + "step": 13155 + }, + { + "epoch": 4.856509968003938, + "grad_norm": 0.9763908982276917, + "learning_rate": 4.388492470922845e-07, + "loss": 0.55, + "step": 13156 + }, + { + "epoch": 4.856879153334974, + "grad_norm": 0.925216555595398, + "learning_rate": 4.3857403692717933e-07, + "loss": 0.5689, + "step": 13157 + }, + { + "epoch": 4.85724833866601, + "grad_norm": 0.9551888108253479, + "learning_rate": 4.382989047863978e-07, + "loss": 0.566, + "step": 13158 + }, + { + "epoch": 4.857617523997046, + "grad_norm": 0.9885072112083435, + "learning_rate": 4.380238506803505e-07, + "loss": 0.593, + "step": 13159 + }, + { + "epoch": 4.857986709328083, + "grad_norm": 1.012276291847229, + "learning_rate": 4.3774887461944997e-07, + "loss": 0.5996, + "step": 13160 + }, + { + "epoch": 4.858355894659119, + "grad_norm": 0.9784184098243713, + "learning_rate": 4.3747397661410395e-07, + "loss": 0.5591, + "step": 13161 + }, + { + "epoch": 4.858725079990155, + "grad_norm": 0.9325795769691467, + "learning_rate": 4.371991566747172e-07, + "loss": 0.5718, + "step": 13162 + }, + { + "epoch": 4.8590942653211915, + "grad_norm": 0.9691129326820374, + "learning_rate": 4.3692441481169134e-07, + "loss": 0.5664, + "step": 13163 + }, + { + "epoch": 4.859463450652227, + "grad_norm": 0.9362803101539612, + "learning_rate": 4.366497510354245e-07, + "loss": 0.5616, + "step": 13164 + }, + { + "epoch": 4.859832635983263, + "grad_norm": 0.9132379293441772, + "learning_rate": 4.3637516535631365e-07, + "loss": 0.5182, + "step": 13165 + }, + { + "epoch": 4.8602018213143, + "grad_norm": 0.9344361424446106, + "learning_rate": 4.3610065778475236e-07, + "loss": 0.5757, + "step": 13166 + }, + { + "epoch": 4.860571006645336, + "grad_norm": 0.947309672832489, + "learning_rate": 4.3582622833112904e-07, + "loss": 0.562, + "step": 13167 + }, + { + "epoch": 4.860940191976372, + "grad_norm": 0.9303807616233826, + "learning_rate": 4.3555187700583175e-07, + "loss": 0.5906, + "step": 13168 + }, + { + "epoch": 4.861309377307409, + "grad_norm": 0.9692209362983704, + "learning_rate": 4.352776038192438e-07, + "loss": 0.5445, + "step": 13169 + }, + { + "epoch": 4.861678562638445, + "grad_norm": 0.9420490860939026, + "learning_rate": 4.3500340878174756e-07, + "loss": 0.5606, + "step": 13170 + }, + { + "epoch": 4.862047747969481, + "grad_norm": 0.9342072010040283, + "learning_rate": 4.347292919037208e-07, + "loss": 0.58, + "step": 13171 + }, + { + "epoch": 4.8624169333005165, + "grad_norm": 0.9761987924575806, + "learning_rate": 4.344552531955376e-07, + "loss": 0.575, + "step": 13172 + }, + { + "epoch": 4.862786118631553, + "grad_norm": 0.948101818561554, + "learning_rate": 4.341812926675718e-07, + "loss": 0.5606, + "step": 13173 + }, + { + "epoch": 4.863155303962589, + "grad_norm": 0.9216741323471069, + "learning_rate": 4.339074103301921e-07, + "loss": 0.5555, + "step": 13174 + }, + { + "epoch": 4.863524489293625, + "grad_norm": 0.9297271370887756, + "learning_rate": 4.336336061937646e-07, + "loss": 0.5557, + "step": 13175 + }, + { + "epoch": 4.863893674624662, + "grad_norm": 0.9459009170532227, + "learning_rate": 4.333598802686531e-07, + "loss": 0.5723, + "step": 13176 + }, + { + "epoch": 4.864262859955698, + "grad_norm": 0.9792972803115845, + "learning_rate": 4.330862325652174e-07, + "loss": 0.5868, + "step": 13177 + }, + { + "epoch": 4.864632045286734, + "grad_norm": 0.9550926685333252, + "learning_rate": 4.3281266309381705e-07, + "loss": 0.5586, + "step": 13178 + }, + { + "epoch": 4.86500123061777, + "grad_norm": 0.9351836442947388, + "learning_rate": 4.3253917186480337e-07, + "loss": 0.5565, + "step": 13179 + }, + { + "epoch": 4.8653704159488065, + "grad_norm": 0.9721566438674927, + "learning_rate": 4.322657588885296e-07, + "loss": 0.5343, + "step": 13180 + }, + { + "epoch": 4.865739601279842, + "grad_norm": 0.9306585788726807, + "learning_rate": 4.319924241753448e-07, + "loss": 0.5561, + "step": 13181 + }, + { + "epoch": 4.866108786610878, + "grad_norm": 0.943486213684082, + "learning_rate": 4.3171916773559417e-07, + "loss": 0.5718, + "step": 13182 + }, + { + "epoch": 4.866477971941915, + "grad_norm": 0.9633145928382874, + "learning_rate": 4.3144598957962054e-07, + "loss": 0.5305, + "step": 13183 + }, + { + "epoch": 4.866847157272951, + "grad_norm": 0.9619407057762146, + "learning_rate": 4.311728897177625e-07, + "loss": 0.5404, + "step": 13184 + }, + { + "epoch": 4.867216342603987, + "grad_norm": 0.9970574975013733, + "learning_rate": 4.3089986816035766e-07, + "loss": 0.566, + "step": 13185 + }, + { + "epoch": 4.867585527935024, + "grad_norm": 0.9647724628448486, + "learning_rate": 4.306269249177408e-07, + "loss": 0.5496, + "step": 13186 + }, + { + "epoch": 4.86795471326606, + "grad_norm": 0.9781527519226074, + "learning_rate": 4.303540600002409e-07, + "loss": 0.5603, + "step": 13187 + }, + { + "epoch": 4.868323898597096, + "grad_norm": 0.9956498742103577, + "learning_rate": 4.3008127341818695e-07, + "loss": 0.5868, + "step": 13188 + }, + { + "epoch": 4.868693083928132, + "grad_norm": 0.9556920528411865, + "learning_rate": 4.298085651819031e-07, + "loss": 0.5575, + "step": 13189 + }, + { + "epoch": 4.869062269259168, + "grad_norm": 0.9751665592193604, + "learning_rate": 4.295359353017123e-07, + "loss": 0.549, + "step": 13190 + }, + { + "epoch": 4.869431454590204, + "grad_norm": 0.9349479675292969, + "learning_rate": 4.292633837879329e-07, + "loss": 0.5583, + "step": 13191 + }, + { + "epoch": 4.86980063992124, + "grad_norm": 0.9871910810470581, + "learning_rate": 4.2899091065088023e-07, + "loss": 0.5634, + "step": 13192 + }, + { + "epoch": 4.870169825252277, + "grad_norm": 0.9479964971542358, + "learning_rate": 4.287185159008686e-07, + "loss": 0.5868, + "step": 13193 + }, + { + "epoch": 4.870539010583313, + "grad_norm": 0.963447093963623, + "learning_rate": 4.2844619954820727e-07, + "loss": 0.5642, + "step": 13194 + }, + { + "epoch": 4.870908195914349, + "grad_norm": 0.9673013091087341, + "learning_rate": 4.2817396160320294e-07, + "loss": 0.5716, + "step": 13195 + }, + { + "epoch": 4.871277381245386, + "grad_norm": 0.9745810031890869, + "learning_rate": 4.2790180207616106e-07, + "loss": 0.5473, + "step": 13196 + }, + { + "epoch": 4.8716465665764215, + "grad_norm": 0.9448627829551697, + "learning_rate": 4.276297209773814e-07, + "loss": 0.5982, + "step": 13197 + }, + { + "epoch": 4.872015751907457, + "grad_norm": 0.978668212890625, + "learning_rate": 4.2735771831716314e-07, + "loss": 0.6023, + "step": 13198 + }, + { + "epoch": 4.872384937238493, + "grad_norm": 0.9765504598617554, + "learning_rate": 4.2708579410580123e-07, + "loss": 0.5574, + "step": 13199 + }, + { + "epoch": 4.87275412256953, + "grad_norm": 0.9769113063812256, + "learning_rate": 4.268139483535874e-07, + "loss": 0.5684, + "step": 13200 + }, + { + "epoch": 4.873123307900566, + "grad_norm": 0.9828941226005554, + "learning_rate": 4.265421810708117e-07, + "loss": 0.5463, + "step": 13201 + }, + { + "epoch": 4.873492493231602, + "grad_norm": 1.0449920892715454, + "learning_rate": 4.262704922677602e-07, + "loss": 0.5305, + "step": 13202 + }, + { + "epoch": 4.873861678562639, + "grad_norm": 0.9703594446182251, + "learning_rate": 4.2599888195471534e-07, + "loss": 0.5939, + "step": 13203 + }, + { + "epoch": 4.874230863893675, + "grad_norm": 0.9276857972145081, + "learning_rate": 4.257273501419587e-07, + "loss": 0.5692, + "step": 13204 + }, + { + "epoch": 4.874600049224711, + "grad_norm": 0.9944252371788025, + "learning_rate": 4.2545589683976677e-07, + "loss": 0.569, + "step": 13205 + }, + { + "epoch": 4.874969234555747, + "grad_norm": 0.971705973148346, + "learning_rate": 4.251845220584155e-07, + "loss": 0.579, + "step": 13206 + }, + { + "epoch": 4.875338419886783, + "grad_norm": 0.946364164352417, + "learning_rate": 4.249132258081742e-07, + "loss": 0.5533, + "step": 13207 + }, + { + "epoch": 4.875707605217819, + "grad_norm": 0.9541913270950317, + "learning_rate": 4.2464200809931244e-07, + "loss": 0.6289, + "step": 13208 + }, + { + "epoch": 4.876076790548856, + "grad_norm": 0.9936672449111938, + "learning_rate": 4.243708689420961e-07, + "loss": 0.5696, + "step": 13209 + }, + { + "epoch": 4.876445975879892, + "grad_norm": 0.9584001302719116, + "learning_rate": 4.240998083467876e-07, + "loss": 0.5539, + "step": 13210 + }, + { + "epoch": 4.876815161210928, + "grad_norm": 0.9746771454811096, + "learning_rate": 4.238288263236459e-07, + "loss": 0.592, + "step": 13211 + }, + { + "epoch": 4.877184346541964, + "grad_norm": 0.9687290787696838, + "learning_rate": 4.235579228829273e-07, + "loss": 0.5533, + "step": 13212 + }, + { + "epoch": 4.877553531873001, + "grad_norm": 0.946291983127594, + "learning_rate": 4.232870980348866e-07, + "loss": 0.5915, + "step": 13213 + }, + { + "epoch": 4.8779227172040365, + "grad_norm": 0.94402676820755, + "learning_rate": 4.23016351789774e-07, + "loss": 0.5994, + "step": 13214 + }, + { + "epoch": 4.878291902535072, + "grad_norm": 0.9689635634422302, + "learning_rate": 4.227456841578362e-07, + "loss": 0.5881, + "step": 13215 + }, + { + "epoch": 4.878661087866108, + "grad_norm": 0.9691056609153748, + "learning_rate": 4.224750951493192e-07, + "loss": 0.5808, + "step": 13216 + }, + { + "epoch": 4.879030273197145, + "grad_norm": 0.9694831371307373, + "learning_rate": 4.222045847744635e-07, + "loss": 0.6167, + "step": 13217 + }, + { + "epoch": 4.879399458528181, + "grad_norm": 0.9672718048095703, + "learning_rate": 4.219341530435095e-07, + "loss": 0.5729, + "step": 13218 + }, + { + "epoch": 4.879768643859217, + "grad_norm": 0.9988065958023071, + "learning_rate": 4.216637999666917e-07, + "loss": 0.5469, + "step": 13219 + }, + { + "epoch": 4.880137829190254, + "grad_norm": 0.9549050331115723, + "learning_rate": 4.213935255542426e-07, + "loss": 0.5749, + "step": 13220 + }, + { + "epoch": 4.88050701452129, + "grad_norm": 0.9386558532714844, + "learning_rate": 4.21123329816393e-07, + "loss": 0.5926, + "step": 13221 + }, + { + "epoch": 4.880876199852326, + "grad_norm": 0.9718097448348999, + "learning_rate": 4.208532127633694e-07, + "loss": 0.5558, + "step": 13222 + }, + { + "epoch": 4.881245385183362, + "grad_norm": 0.9519214630126953, + "learning_rate": 4.2058317440539484e-07, + "loss": 0.532, + "step": 13223 + }, + { + "epoch": 4.881614570514398, + "grad_norm": 0.932725727558136, + "learning_rate": 4.203132147526914e-07, + "loss": 0.5398, + "step": 13224 + }, + { + "epoch": 4.881983755845434, + "grad_norm": 0.9672147631645203, + "learning_rate": 4.2004333381547606e-07, + "loss": 0.5682, + "step": 13225 + }, + { + "epoch": 4.882352941176471, + "grad_norm": 0.9550067186355591, + "learning_rate": 4.19773531603965e-07, + "loss": 0.607, + "step": 13226 + }, + { + "epoch": 4.882722126507507, + "grad_norm": 0.9706442356109619, + "learning_rate": 4.195038081283684e-07, + "loss": 0.5854, + "step": 13227 + }, + { + "epoch": 4.883091311838543, + "grad_norm": 0.9771723747253418, + "learning_rate": 4.192341633988958e-07, + "loss": 0.5492, + "step": 13228 + }, + { + "epoch": 4.883460497169579, + "grad_norm": 0.9283353090286255, + "learning_rate": 4.189645974257539e-07, + "loss": 0.5654, + "step": 13229 + }, + { + "epoch": 4.8838296825006156, + "grad_norm": 0.9703173041343689, + "learning_rate": 4.1869511021914527e-07, + "loss": 0.5539, + "step": 13230 + }, + { + "epoch": 4.8841988678316515, + "grad_norm": 0.9780389070510864, + "learning_rate": 4.184257017892696e-07, + "loss": 0.5549, + "step": 13231 + }, + { + "epoch": 4.884568053162687, + "grad_norm": 0.9616702198982239, + "learning_rate": 4.181563721463244e-07, + "loss": 0.6183, + "step": 13232 + }, + { + "epoch": 4.884937238493724, + "grad_norm": 0.9687731266021729, + "learning_rate": 4.178871213005034e-07, + "loss": 0.5946, + "step": 13233 + }, + { + "epoch": 4.88530642382476, + "grad_norm": 0.9759718775749207, + "learning_rate": 4.1761794926199807e-07, + "loss": 0.6381, + "step": 13234 + }, + { + "epoch": 4.885675609155796, + "grad_norm": 0.9545261859893799, + "learning_rate": 4.173488560409952e-07, + "loss": 0.5805, + "step": 13235 + }, + { + "epoch": 4.886044794486832, + "grad_norm": 0.9795812368392944, + "learning_rate": 4.1707984164768123e-07, + "loss": 0.5768, + "step": 13236 + }, + { + "epoch": 4.886413979817869, + "grad_norm": 0.9566190838813782, + "learning_rate": 4.1681090609223817e-07, + "loss": 0.6113, + "step": 13237 + }, + { + "epoch": 4.886783165148905, + "grad_norm": 0.966839611530304, + "learning_rate": 4.165420493848449e-07, + "loss": 0.5789, + "step": 13238 + }, + { + "epoch": 4.887152350479941, + "grad_norm": 0.9688395857810974, + "learning_rate": 4.1627327153567763e-07, + "loss": 0.5719, + "step": 13239 + }, + { + "epoch": 4.887521535810977, + "grad_norm": 0.9483798146247864, + "learning_rate": 4.1600457255490863e-07, + "loss": 0.5892, + "step": 13240 + }, + { + "epoch": 4.887890721142013, + "grad_norm": 0.961580216884613, + "learning_rate": 4.1573595245270965e-07, + "loss": 0.5724, + "step": 13241 + }, + { + "epoch": 4.888259906473049, + "grad_norm": 0.9550166130065918, + "learning_rate": 4.1546741123924676e-07, + "loss": 0.5614, + "step": 13242 + }, + { + "epoch": 4.888629091804086, + "grad_norm": 0.9651317000389099, + "learning_rate": 4.151989489246841e-07, + "loss": 0.5834, + "step": 13243 + }, + { + "epoch": 4.888998277135122, + "grad_norm": 0.95052170753479, + "learning_rate": 4.1493056551918377e-07, + "loss": 0.5973, + "step": 13244 + }, + { + "epoch": 4.889367462466158, + "grad_norm": 0.971348762512207, + "learning_rate": 4.146622610329029e-07, + "loss": 0.5472, + "step": 13245 + }, + { + "epoch": 4.889736647797195, + "grad_norm": 0.9725842475891113, + "learning_rate": 4.1439403547599786e-07, + "loss": 0.5458, + "step": 13246 + }, + { + "epoch": 4.8901058331282306, + "grad_norm": 0.9840410947799683, + "learning_rate": 4.1412588885862055e-07, + "loss": 0.557, + "step": 13247 + }, + { + "epoch": 4.8904750184592665, + "grad_norm": 0.9636935591697693, + "learning_rate": 4.1385782119091917e-07, + "loss": 0.5913, + "step": 13248 + }, + { + "epoch": 4.890844203790302, + "grad_norm": 0.9178866147994995, + "learning_rate": 4.135898324830417e-07, + "loss": 0.5748, + "step": 13249 + }, + { + "epoch": 4.891213389121339, + "grad_norm": 0.9770238399505615, + "learning_rate": 4.133219227451304e-07, + "loss": 0.5836, + "step": 13250 + }, + { + "epoch": 4.891582574452375, + "grad_norm": 0.9391356706619263, + "learning_rate": 4.130540919873252e-07, + "loss": 0.5812, + "step": 13251 + }, + { + "epoch": 4.891951759783411, + "grad_norm": 0.9544983506202698, + "learning_rate": 4.127863402197646e-07, + "loss": 0.5308, + "step": 13252 + }, + { + "epoch": 4.892320945114448, + "grad_norm": 0.9346486330032349, + "learning_rate": 4.1251866745258243e-07, + "loss": 0.6055, + "step": 13253 + }, + { + "epoch": 4.892690130445484, + "grad_norm": 0.952747642993927, + "learning_rate": 4.1225107369590954e-07, + "loss": 0.5704, + "step": 13254 + }, + { + "epoch": 4.89305931577652, + "grad_norm": 0.9600462317466736, + "learning_rate": 4.119835589598742e-07, + "loss": 0.5308, + "step": 13255 + }, + { + "epoch": 4.893428501107556, + "grad_norm": 0.9821571111679077, + "learning_rate": 4.1171612325460244e-07, + "loss": 0.5303, + "step": 13256 + }, + { + "epoch": 4.893797686438592, + "grad_norm": 0.896121084690094, + "learning_rate": 4.1144876659021644e-07, + "loss": 0.5487, + "step": 13257 + }, + { + "epoch": 4.894166871769628, + "grad_norm": 0.9564840793609619, + "learning_rate": 4.111814889768356e-07, + "loss": 0.5532, + "step": 13258 + }, + { + "epoch": 4.894536057100664, + "grad_norm": 0.9558310508728027, + "learning_rate": 4.109142904245764e-07, + "loss": 0.6153, + "step": 13259 + }, + { + "epoch": 4.894905242431701, + "grad_norm": 1.0082658529281616, + "learning_rate": 4.106471709435514e-07, + "loss": 0.574, + "step": 13260 + }, + { + "epoch": 4.895274427762737, + "grad_norm": 0.9526282548904419, + "learning_rate": 4.1038013054387175e-07, + "loss": 0.5783, + "step": 13261 + }, + { + "epoch": 4.895643613093773, + "grad_norm": 0.9427253603935242, + "learning_rate": 4.1011316923564496e-07, + "loss": 0.5489, + "step": 13262 + }, + { + "epoch": 4.89601279842481, + "grad_norm": 0.9460694193840027, + "learning_rate": 4.098462870289746e-07, + "loss": 0.5541, + "step": 13263 + }, + { + "epoch": 4.8963819837558455, + "grad_norm": 0.9501973986625671, + "learning_rate": 4.095794839339631e-07, + "loss": 0.5938, + "step": 13264 + }, + { + "epoch": 4.8967511690868815, + "grad_norm": 0.9941124320030212, + "learning_rate": 4.0931275996070774e-07, + "loss": 0.6159, + "step": 13265 + }, + { + "epoch": 4.897120354417918, + "grad_norm": 0.9364920854568481, + "learning_rate": 4.0904611511930535e-07, + "loss": 0.5552, + "step": 13266 + }, + { + "epoch": 4.897489539748954, + "grad_norm": 0.9335883259773254, + "learning_rate": 4.087795494198474e-07, + "loss": 0.5659, + "step": 13267 + }, + { + "epoch": 4.89785872507999, + "grad_norm": 0.956126868724823, + "learning_rate": 4.085130628724229e-07, + "loss": 0.5846, + "step": 13268 + }, + { + "epoch": 4.898227910411026, + "grad_norm": 0.9544212222099304, + "learning_rate": 4.0824665548711976e-07, + "loss": 0.5981, + "step": 13269 + }, + { + "epoch": 4.898597095742063, + "grad_norm": 0.9674879908561707, + "learning_rate": 4.0798032727402013e-07, + "loss": 0.5653, + "step": 13270 + }, + { + "epoch": 4.898966281073099, + "grad_norm": 0.9573246240615845, + "learning_rate": 4.0771407824320456e-07, + "loss": 0.5611, + "step": 13271 + }, + { + "epoch": 4.899335466404135, + "grad_norm": 0.9496637582778931, + "learning_rate": 4.074479084047514e-07, + "loss": 0.5957, + "step": 13272 + }, + { + "epoch": 4.899704651735171, + "grad_norm": 1.006679654121399, + "learning_rate": 4.071818177687345e-07, + "loss": 0.5853, + "step": 13273 + }, + { + "epoch": 4.900073837066207, + "grad_norm": 0.9461730122566223, + "learning_rate": 4.0691580634522445e-07, + "loss": 0.5704, + "step": 13274 + }, + { + "epoch": 4.900443022397243, + "grad_norm": 0.964617133140564, + "learning_rate": 4.0664987414429145e-07, + "loss": 0.5781, + "step": 13275 + }, + { + "epoch": 4.900812207728279, + "grad_norm": 0.9849216341972351, + "learning_rate": 4.0638402117599925e-07, + "loss": 0.5912, + "step": 13276 + }, + { + "epoch": 4.901181393059316, + "grad_norm": 0.9737285375595093, + "learning_rate": 4.061182474504116e-07, + "loss": 0.6335, + "step": 13277 + }, + { + "epoch": 4.901550578390352, + "grad_norm": 0.974108099937439, + "learning_rate": 4.058525529775875e-07, + "loss": 0.604, + "step": 13278 + }, + { + "epoch": 4.901919763721388, + "grad_norm": 0.9394720196723938, + "learning_rate": 4.055869377675831e-07, + "loss": 0.554, + "step": 13279 + }, + { + "epoch": 4.902288949052425, + "grad_norm": 0.9680590629577637, + "learning_rate": 4.0532140183045215e-07, + "loss": 0.583, + "step": 13280 + }, + { + "epoch": 4.9026581343834605, + "grad_norm": 0.9729242920875549, + "learning_rate": 4.0505594517624533e-07, + "loss": 0.5587, + "step": 13281 + }, + { + "epoch": 4.9030273197144965, + "grad_norm": 1.0272668600082397, + "learning_rate": 4.047905678150099e-07, + "loss": 0.5626, + "step": 13282 + }, + { + "epoch": 4.903396505045533, + "grad_norm": 0.9302543997764587, + "learning_rate": 4.0452526975678945e-07, + "loss": 0.5653, + "step": 13283 + }, + { + "epoch": 4.903765690376569, + "grad_norm": 0.9536417126655579, + "learning_rate": 4.0426005101162624e-07, + "loss": 0.6049, + "step": 13284 + }, + { + "epoch": 4.904134875707605, + "grad_norm": 0.9742435216903687, + "learning_rate": 4.039949115895594e-07, + "loss": 0.6171, + "step": 13285 + }, + { + "epoch": 4.904504061038642, + "grad_norm": 0.9555721879005432, + "learning_rate": 4.0372985150062354e-07, + "loss": 0.5792, + "step": 13286 + }, + { + "epoch": 4.904873246369678, + "grad_norm": 0.9532057046890259, + "learning_rate": 4.034648707548514e-07, + "loss": 0.6075, + "step": 13287 + }, + { + "epoch": 4.905242431700714, + "grad_norm": 0.9602806568145752, + "learning_rate": 4.0319996936227123e-07, + "loss": 0.6128, + "step": 13288 + }, + { + "epoch": 4.90561161703175, + "grad_norm": 0.9851533770561218, + "learning_rate": 4.0293514733291127e-07, + "loss": 0.5703, + "step": 13289 + }, + { + "epoch": 4.905980802362786, + "grad_norm": 0.9653774499893188, + "learning_rate": 4.026704046767943e-07, + "loss": 0.5637, + "step": 13290 + }, + { + "epoch": 4.906349987693822, + "grad_norm": 0.9621008038520813, + "learning_rate": 4.0240574140393977e-07, + "loss": 0.5697, + "step": 13291 + }, + { + "epoch": 4.906719173024858, + "grad_norm": 0.9529480934143066, + "learning_rate": 4.0214115752436664e-07, + "loss": 0.5509, + "step": 13292 + }, + { + "epoch": 4.907088358355894, + "grad_norm": 0.95651775598526, + "learning_rate": 4.018766530480878e-07, + "loss": 0.5941, + "step": 13293 + }, + { + "epoch": 4.907457543686931, + "grad_norm": 0.9896916747093201, + "learning_rate": 4.016122279851162e-07, + "loss": 0.5949, + "step": 13294 + }, + { + "epoch": 4.907826729017967, + "grad_norm": 0.983445405960083, + "learning_rate": 4.0134788234545945e-07, + "loss": 0.5925, + "step": 13295 + }, + { + "epoch": 4.908195914349003, + "grad_norm": 0.9355384111404419, + "learning_rate": 4.0108361613912265e-07, + "loss": 0.5586, + "step": 13296 + }, + { + "epoch": 4.90856509968004, + "grad_norm": 0.951873242855072, + "learning_rate": 4.008194293761089e-07, + "loss": 0.5762, + "step": 13297 + }, + { + "epoch": 4.9089342850110755, + "grad_norm": 0.9550222754478455, + "learning_rate": 4.005553220664174e-07, + "loss": 0.5654, + "step": 13298 + }, + { + "epoch": 4.9093034703421115, + "grad_norm": 0.9687116742134094, + "learning_rate": 4.0029129422004347e-07, + "loss": 0.5945, + "step": 13299 + }, + { + "epoch": 4.909672655673148, + "grad_norm": 0.9478840827941895, + "learning_rate": 4.000273458469822e-07, + "loss": 0.5796, + "step": 13300 + }, + { + "epoch": 4.910041841004184, + "grad_norm": 0.9600197076797485, + "learning_rate": 3.9976347695722313e-07, + "loss": 0.5648, + "step": 13301 + }, + { + "epoch": 4.91041102633522, + "grad_norm": 0.959915280342102, + "learning_rate": 3.9949968756075373e-07, + "loss": 0.6046, + "step": 13302 + }, + { + "epoch": 4.910780211666257, + "grad_norm": 0.9785735607147217, + "learning_rate": 3.992359776675575e-07, + "loss": 0.5673, + "step": 13303 + }, + { + "epoch": 4.911149396997293, + "grad_norm": 0.9787002205848694, + "learning_rate": 3.989723472876164e-07, + "loss": 0.5317, + "step": 13304 + }, + { + "epoch": 4.911518582328329, + "grad_norm": 0.9509599208831787, + "learning_rate": 3.987087964309097e-07, + "loss": 0.5934, + "step": 13305 + }, + { + "epoch": 4.911887767659365, + "grad_norm": 0.9651609063148499, + "learning_rate": 3.984453251074116e-07, + "loss": 0.5956, + "step": 13306 + }, + { + "epoch": 4.912256952990401, + "grad_norm": 0.9794147610664368, + "learning_rate": 3.981819333270942e-07, + "loss": 0.576, + "step": 13307 + }, + { + "epoch": 4.912626138321437, + "grad_norm": 0.9347951412200928, + "learning_rate": 3.979186210999278e-07, + "loss": 0.592, + "step": 13308 + }, + { + "epoch": 4.912995323652473, + "grad_norm": 0.9652244448661804, + "learning_rate": 3.976553884358783e-07, + "loss": 0.5713, + "step": 13309 + }, + { + "epoch": 4.91336450898351, + "grad_norm": 0.9578408598899841, + "learning_rate": 3.973922353449089e-07, + "loss": 0.5746, + "step": 13310 + }, + { + "epoch": 4.913733694314546, + "grad_norm": 0.9633655548095703, + "learning_rate": 3.971291618369791e-07, + "loss": 0.6266, + "step": 13311 + }, + { + "epoch": 4.914102879645582, + "grad_norm": 0.9950439929962158, + "learning_rate": 3.9686616792204677e-07, + "loss": 0.5888, + "step": 13312 + }, + { + "epoch": 4.914472064976618, + "grad_norm": 0.967655599117279, + "learning_rate": 3.9660325361006694e-07, + "loss": 0.5677, + "step": 13313 + }, + { + "epoch": 4.914841250307655, + "grad_norm": 0.969683051109314, + "learning_rate": 3.963404189109901e-07, + "loss": 0.5959, + "step": 13314 + }, + { + "epoch": 4.9152104356386905, + "grad_norm": 0.9177119135856628, + "learning_rate": 3.960776638347644e-07, + "loss": 0.5388, + "step": 13315 + }, + { + "epoch": 4.9155796209697264, + "grad_norm": 0.9678689241409302, + "learning_rate": 3.9581498839133455e-07, + "loss": 0.5787, + "step": 13316 + }, + { + "epoch": 4.915948806300763, + "grad_norm": 0.960037350654602, + "learning_rate": 3.9555239259064403e-07, + "loss": 0.5563, + "step": 13317 + }, + { + "epoch": 4.916317991631799, + "grad_norm": 0.9581636786460876, + "learning_rate": 3.9528987644263106e-07, + "loss": 0.5708, + "step": 13318 + }, + { + "epoch": 4.916687176962835, + "grad_norm": 0.9776982069015503, + "learning_rate": 3.9502743995723155e-07, + "loss": 0.6144, + "step": 13319 + }, + { + "epoch": 4.917056362293872, + "grad_norm": 0.9786728620529175, + "learning_rate": 3.947650831443797e-07, + "loss": 0.5609, + "step": 13320 + }, + { + "epoch": 4.917425547624908, + "grad_norm": 0.9859618544578552, + "learning_rate": 3.945028060140049e-07, + "loss": 0.5771, + "step": 13321 + }, + { + "epoch": 4.917794732955944, + "grad_norm": 0.9429854154586792, + "learning_rate": 3.942406085760339e-07, + "loss": 0.5658, + "step": 13322 + }, + { + "epoch": 4.9181639182869805, + "grad_norm": 0.9925432801246643, + "learning_rate": 3.939784908403918e-07, + "loss": 0.552, + "step": 13323 + }, + { + "epoch": 4.918533103618016, + "grad_norm": 0.9810221791267395, + "learning_rate": 3.937164528169987e-07, + "loss": 0.5859, + "step": 13324 + }, + { + "epoch": 4.918902288949052, + "grad_norm": 0.9608479738235474, + "learning_rate": 3.934544945157737e-07, + "loss": 0.5558, + "step": 13325 + }, + { + "epoch": 4.919271474280088, + "grad_norm": 0.9645576477050781, + "learning_rate": 3.931926159466312e-07, + "loss": 0.5913, + "step": 13326 + }, + { + "epoch": 4.919640659611125, + "grad_norm": 0.9641848802566528, + "learning_rate": 3.929308171194826e-07, + "loss": 0.5825, + "step": 13327 + }, + { + "epoch": 4.920009844942161, + "grad_norm": 0.9608797430992126, + "learning_rate": 3.9266909804423825e-07, + "loss": 0.569, + "step": 13328 + }, + { + "epoch": 4.920379030273197, + "grad_norm": 0.9704278707504272, + "learning_rate": 3.924074587308033e-07, + "loss": 0.5517, + "step": 13329 + }, + { + "epoch": 4.920748215604234, + "grad_norm": 0.9328110814094543, + "learning_rate": 3.9214589918908113e-07, + "loss": 0.5466, + "step": 13330 + }, + { + "epoch": 4.92111740093527, + "grad_norm": 0.963100790977478, + "learning_rate": 3.918844194289706e-07, + "loss": 0.5629, + "step": 13331 + }, + { + "epoch": 4.9214865862663055, + "grad_norm": 0.9539852142333984, + "learning_rate": 3.9162301946036956e-07, + "loss": 0.5892, + "step": 13332 + }, + { + "epoch": 4.9218557715973414, + "grad_norm": 0.9856855869293213, + "learning_rate": 3.9136169929317234e-07, + "loss": 0.5328, + "step": 13333 + }, + { + "epoch": 4.922224956928378, + "grad_norm": 0.9512057304382324, + "learning_rate": 3.911004589372694e-07, + "loss": 0.5792, + "step": 13334 + }, + { + "epoch": 4.922594142259414, + "grad_norm": 0.9717251658439636, + "learning_rate": 3.9083929840254856e-07, + "loss": 0.5906, + "step": 13335 + }, + { + "epoch": 4.92296332759045, + "grad_norm": 0.9683064818382263, + "learning_rate": 3.9057821769889396e-07, + "loss": 0.5763, + "step": 13336 + }, + { + "epoch": 4.923332512921487, + "grad_norm": 0.9534940123558044, + "learning_rate": 3.903172168361885e-07, + "loss": 0.5392, + "step": 13337 + }, + { + "epoch": 4.923701698252523, + "grad_norm": 1.001999855041504, + "learning_rate": 3.9005629582431065e-07, + "loss": 0.5673, + "step": 13338 + }, + { + "epoch": 4.924070883583559, + "grad_norm": 0.9801926612854004, + "learning_rate": 3.897954546731356e-07, + "loss": 0.5698, + "step": 13339 + }, + { + "epoch": 4.9244400689145955, + "grad_norm": 0.9661502838134766, + "learning_rate": 3.895346933925373e-07, + "loss": 0.5637, + "step": 13340 + }, + { + "epoch": 4.924809254245631, + "grad_norm": 0.9389491081237793, + "learning_rate": 3.8927401199238463e-07, + "loss": 0.567, + "step": 13341 + }, + { + "epoch": 4.925178439576667, + "grad_norm": 0.9833522439002991, + "learning_rate": 3.890134104825438e-07, + "loss": 0.5803, + "step": 13342 + }, + { + "epoch": 4.925547624907704, + "grad_norm": 0.9532436728477478, + "learning_rate": 3.8875288887287976e-07, + "loss": 0.5738, + "step": 13343 + }, + { + "epoch": 4.92591681023874, + "grad_norm": 0.9377341270446777, + "learning_rate": 3.8849244717325206e-07, + "loss": 0.608, + "step": 13344 + }, + { + "epoch": 4.926285995569776, + "grad_norm": 0.9906229972839355, + "learning_rate": 3.8823208539351925e-07, + "loss": 0.5884, + "step": 13345 + }, + { + "epoch": 4.926655180900812, + "grad_norm": 0.9860050678253174, + "learning_rate": 3.879718035435356e-07, + "loss": 0.5408, + "step": 13346 + }, + { + "epoch": 4.927024366231849, + "grad_norm": 0.9442058801651001, + "learning_rate": 3.8771160163315226e-07, + "loss": 0.5575, + "step": 13347 + }, + { + "epoch": 4.927393551562885, + "grad_norm": 0.9485129117965698, + "learning_rate": 3.8745147967221843e-07, + "loss": 0.5629, + "step": 13348 + }, + { + "epoch": 4.9277627368939205, + "grad_norm": 0.9915307760238647, + "learning_rate": 3.871914376705796e-07, + "loss": 0.5997, + "step": 13349 + }, + { + "epoch": 4.928131922224956, + "grad_norm": 0.9536213278770447, + "learning_rate": 3.869314756380771e-07, + "loss": 0.5657, + "step": 13350 + }, + { + "epoch": 4.928501107555993, + "grad_norm": 0.9705127477645874, + "learning_rate": 3.8667159358455217e-07, + "loss": 0.5999, + "step": 13351 + }, + { + "epoch": 4.928870292887029, + "grad_norm": 0.9503833055496216, + "learning_rate": 3.8641179151984007e-07, + "loss": 0.5702, + "step": 13352 + }, + { + "epoch": 4.929239478218065, + "grad_norm": 0.9661746025085449, + "learning_rate": 3.8615206945377486e-07, + "loss": 0.5694, + "step": 13353 + }, + { + "epoch": 4.929608663549102, + "grad_norm": 0.9615142345428467, + "learning_rate": 3.85892427396187e-07, + "loss": 0.5751, + "step": 13354 + }, + { + "epoch": 4.929977848880138, + "grad_norm": 0.9901025295257568, + "learning_rate": 3.8563286535690274e-07, + "loss": 0.5621, + "step": 13355 + }, + { + "epoch": 4.930347034211174, + "grad_norm": 0.9669297933578491, + "learning_rate": 3.8537338334574763e-07, + "loss": 0.5489, + "step": 13356 + }, + { + "epoch": 4.9307162195422105, + "grad_norm": 1.0849553346633911, + "learning_rate": 3.8511398137254296e-07, + "loss": 0.5822, + "step": 13357 + }, + { + "epoch": 4.931085404873246, + "grad_norm": 0.9379593729972839, + "learning_rate": 3.848546594471064e-07, + "loss": 0.5382, + "step": 13358 + }, + { + "epoch": 4.931454590204282, + "grad_norm": 0.9796071648597717, + "learning_rate": 3.845954175792527e-07, + "loss": 0.5574, + "step": 13359 + }, + { + "epoch": 4.931823775535319, + "grad_norm": 0.951331615447998, + "learning_rate": 3.8433625577879495e-07, + "loss": 0.556, + "step": 13360 + }, + { + "epoch": 4.932192960866355, + "grad_norm": 0.9506223201751709, + "learning_rate": 3.840771740555435e-07, + "loss": 0.5821, + "step": 13361 + }, + { + "epoch": 4.932562146197391, + "grad_norm": 0.9854286909103394, + "learning_rate": 3.8381817241930216e-07, + "loss": 0.5412, + "step": 13362 + }, + { + "epoch": 4.932931331528427, + "grad_norm": 0.9453790783882141, + "learning_rate": 3.835592508798755e-07, + "loss": 0.5909, + "step": 13363 + }, + { + "epoch": 4.933300516859464, + "grad_norm": 0.9693858623504639, + "learning_rate": 3.8330040944706245e-07, + "loss": 0.5753, + "step": 13364 + }, + { + "epoch": 4.9336697021905, + "grad_norm": 0.9517672657966614, + "learning_rate": 3.8304164813066173e-07, + "loss": 0.5963, + "step": 13365 + }, + { + "epoch": 4.9340388875215355, + "grad_norm": 0.9178259968757629, + "learning_rate": 3.8278296694046645e-07, + "loss": 0.5981, + "step": 13366 + }, + { + "epoch": 4.934408072852572, + "grad_norm": 1.0005826950073242, + "learning_rate": 3.8252436588626706e-07, + "loss": 0.5882, + "step": 13367 + }, + { + "epoch": 4.934777258183608, + "grad_norm": 1.013216257095337, + "learning_rate": 3.822658449778527e-07, + "loss": 0.5611, + "step": 13368 + }, + { + "epoch": 4.935146443514644, + "grad_norm": 0.978979766368866, + "learning_rate": 3.820074042250077e-07, + "loss": 0.5787, + "step": 13369 + }, + { + "epoch": 4.93551562884568, + "grad_norm": 0.9602137207984924, + "learning_rate": 3.817490436375135e-07, + "loss": 0.5518, + "step": 13370 + }, + { + "epoch": 4.935884814176717, + "grad_norm": 0.9768320322036743, + "learning_rate": 3.814907632251502e-07, + "loss": 0.6054, + "step": 13371 + }, + { + "epoch": 4.936253999507753, + "grad_norm": 0.9776341915130615, + "learning_rate": 3.8123256299769206e-07, + "loss": 0.5746, + "step": 13372 + }, + { + "epoch": 4.936623184838789, + "grad_norm": 0.9504129886627197, + "learning_rate": 3.8097444296491343e-07, + "loss": 0.5396, + "step": 13373 + }, + { + "epoch": 4.9369923701698255, + "grad_norm": 0.9573213458061218, + "learning_rate": 3.8071640313658344e-07, + "loss": 0.5614, + "step": 13374 + }, + { + "epoch": 4.937361555500861, + "grad_norm": 0.9656904339790344, + "learning_rate": 3.8045844352246784e-07, + "loss": 0.5487, + "step": 13375 + }, + { + "epoch": 4.937730740831897, + "grad_norm": 0.9632173180580139, + "learning_rate": 3.8020056413233197e-07, + "loss": 0.5475, + "step": 13376 + }, + { + "epoch": 4.938099926162934, + "grad_norm": 0.93343585729599, + "learning_rate": 3.7994276497593576e-07, + "loss": 0.5953, + "step": 13377 + }, + { + "epoch": 4.93846911149397, + "grad_norm": 1.0099618434906006, + "learning_rate": 3.7968504606303636e-07, + "loss": 0.5792, + "step": 13378 + }, + { + "epoch": 4.938838296825006, + "grad_norm": 0.9613112807273865, + "learning_rate": 3.794274074033891e-07, + "loss": 0.5645, + "step": 13379 + }, + { + "epoch": 4.939207482156043, + "grad_norm": 0.9726617932319641, + "learning_rate": 3.791698490067447e-07, + "loss": 0.6148, + "step": 13380 + }, + { + "epoch": 4.939576667487079, + "grad_norm": 0.9854781031608582, + "learning_rate": 3.789123708828535e-07, + "loss": 0.5661, + "step": 13381 + }, + { + "epoch": 4.939945852818115, + "grad_norm": 0.9641531705856323, + "learning_rate": 3.786549730414582e-07, + "loss": 0.5456, + "step": 13382 + }, + { + "epoch": 4.9403150381491505, + "grad_norm": 0.8968275785446167, + "learning_rate": 3.7839765549230303e-07, + "loss": 0.5838, + "step": 13383 + }, + { + "epoch": 4.940684223480187, + "grad_norm": 0.9799063205718994, + "learning_rate": 3.781404182451273e-07, + "loss": 0.5893, + "step": 13384 + }, + { + "epoch": 4.941053408811223, + "grad_norm": 0.9484923481941223, + "learning_rate": 3.7788326130966695e-07, + "loss": 0.6044, + "step": 13385 + }, + { + "epoch": 4.941422594142259, + "grad_norm": 0.9602192044258118, + "learning_rate": 3.776261846956558e-07, + "loss": 0.5821, + "step": 13386 + }, + { + "epoch": 4.941791779473296, + "grad_norm": 0.9525284171104431, + "learning_rate": 3.7736918841282275e-07, + "loss": 0.5883, + "step": 13387 + }, + { + "epoch": 4.942160964804332, + "grad_norm": 0.9760873913764954, + "learning_rate": 3.7711227247089643e-07, + "loss": 0.5771, + "step": 13388 + }, + { + "epoch": 4.942530150135368, + "grad_norm": 0.9428760409355164, + "learning_rate": 3.7685543687960154e-07, + "loss": 0.6041, + "step": 13389 + }, + { + "epoch": 4.942899335466404, + "grad_norm": 0.9934290051460266, + "learning_rate": 3.7659868164865693e-07, + "loss": 0.5878, + "step": 13390 + }, + { + "epoch": 4.9432685207974405, + "grad_norm": 0.9772612452507019, + "learning_rate": 3.7634200678778296e-07, + "loss": 0.5735, + "step": 13391 + }, + { + "epoch": 4.943637706128476, + "grad_norm": 0.9720263481140137, + "learning_rate": 3.7608541230669317e-07, + "loss": 0.5951, + "step": 13392 + }, + { + "epoch": 4.944006891459512, + "grad_norm": 0.9460076093673706, + "learning_rate": 3.7582889821510065e-07, + "loss": 0.5697, + "step": 13393 + }, + { + "epoch": 4.944376076790549, + "grad_norm": 0.9588996767997742, + "learning_rate": 3.7557246452271405e-07, + "loss": 0.6074, + "step": 13394 + }, + { + "epoch": 4.944745262121585, + "grad_norm": 0.9893524050712585, + "learning_rate": 3.753161112392384e-07, + "loss": 0.5941, + "step": 13395 + }, + { + "epoch": 4.945114447452621, + "grad_norm": 0.9671381711959839, + "learning_rate": 3.750598383743781e-07, + "loss": 0.5382, + "step": 13396 + }, + { + "epoch": 4.945483632783658, + "grad_norm": 0.9511076807975769, + "learning_rate": 3.7480364593783237e-07, + "loss": 0.587, + "step": 13397 + }, + { + "epoch": 4.945852818114694, + "grad_norm": 0.9852848649024963, + "learning_rate": 3.745475339392973e-07, + "loss": 0.5273, + "step": 13398 + }, + { + "epoch": 4.94622200344573, + "grad_norm": 0.9533817172050476, + "learning_rate": 3.7429150238846773e-07, + "loss": 0.5651, + "step": 13399 + }, + { + "epoch": 4.946591188776766, + "grad_norm": 0.9590192437171936, + "learning_rate": 3.740355512950336e-07, + "loss": 0.5861, + "step": 13400 + }, + { + "epoch": 4.946960374107802, + "grad_norm": 0.9527587294578552, + "learning_rate": 3.7377968066868387e-07, + "loss": 0.5872, + "step": 13401 + }, + { + "epoch": 4.947329559438838, + "grad_norm": 0.9996702075004578, + "learning_rate": 3.7352389051910076e-07, + "loss": 0.5767, + "step": 13402 + }, + { + "epoch": 4.947698744769874, + "grad_norm": 0.9493252038955688, + "learning_rate": 3.732681808559677e-07, + "loss": 0.5742, + "step": 13403 + }, + { + "epoch": 4.948067930100911, + "grad_norm": 0.9384365677833557, + "learning_rate": 3.730125516889632e-07, + "loss": 0.6034, + "step": 13404 + }, + { + "epoch": 4.948437115431947, + "grad_norm": 1.685205340385437, + "learning_rate": 3.727570030277622e-07, + "loss": 0.5585, + "step": 13405 + }, + { + "epoch": 4.948806300762983, + "grad_norm": 0.9437605142593384, + "learning_rate": 3.7250153488203736e-07, + "loss": 0.5719, + "step": 13406 + }, + { + "epoch": 4.94917548609402, + "grad_norm": 0.9949869513511658, + "learning_rate": 3.722461472614575e-07, + "loss": 0.5753, + "step": 13407 + }, + { + "epoch": 4.9495446714250555, + "grad_norm": 0.9521825313568115, + "learning_rate": 3.719908401756894e-07, + "loss": 0.5752, + "step": 13408 + }, + { + "epoch": 4.949913856756091, + "grad_norm": 0.9669532179832458, + "learning_rate": 3.717356136343975e-07, + "loss": 0.5846, + "step": 13409 + }, + { + "epoch": 4.950283042087127, + "grad_norm": 0.9711313843727112, + "learning_rate": 3.7148046764723973e-07, + "loss": 0.5562, + "step": 13410 + }, + { + "epoch": 4.950652227418164, + "grad_norm": 0.9508935809135437, + "learning_rate": 3.712254022238751e-07, + "loss": 0.5616, + "step": 13411 + }, + { + "epoch": 4.9510214127492, + "grad_norm": 0.945927083492279, + "learning_rate": 3.709704173739567e-07, + "loss": 0.6205, + "step": 13412 + }, + { + "epoch": 4.951390598080236, + "grad_norm": 0.9675004482269287, + "learning_rate": 3.7071551310713633e-07, + "loss": 0.5903, + "step": 13413 + }, + { + "epoch": 4.951759783411273, + "grad_norm": 0.9295480251312256, + "learning_rate": 3.70460689433062e-07, + "loss": 0.5657, + "step": 13414 + }, + { + "epoch": 4.952128968742309, + "grad_norm": 0.9376156330108643, + "learning_rate": 3.70205946361378e-07, + "loss": 0.5794, + "step": 13415 + }, + { + "epoch": 4.952498154073345, + "grad_norm": 0.9444271326065063, + "learning_rate": 3.6995128390172665e-07, + "loss": 0.5424, + "step": 13416 + }, + { + "epoch": 4.952867339404381, + "grad_norm": 0.9332671165466309, + "learning_rate": 3.696967020637482e-07, + "loss": 0.5374, + "step": 13417 + }, + { + "epoch": 4.953236524735417, + "grad_norm": 0.9391049742698669, + "learning_rate": 3.6944220085707584e-07, + "loss": 0.5532, + "step": 13418 + }, + { + "epoch": 4.953605710066453, + "grad_norm": 0.9507493376731873, + "learning_rate": 3.691877802913446e-07, + "loss": 0.5417, + "step": 13419 + }, + { + "epoch": 4.95397489539749, + "grad_norm": 0.9785206317901611, + "learning_rate": 3.689334403761827e-07, + "loss": 0.5666, + "step": 13420 + }, + { + "epoch": 4.954344080728526, + "grad_norm": 0.9781269431114197, + "learning_rate": 3.6867918112121786e-07, + "loss": 0.5694, + "step": 13421 + }, + { + "epoch": 4.954713266059562, + "grad_norm": 0.9666619896888733, + "learning_rate": 3.684250025360736e-07, + "loss": 0.5939, + "step": 13422 + }, + { + "epoch": 4.955082451390598, + "grad_norm": 0.9648899435997009, + "learning_rate": 3.681709046303697e-07, + "loss": 0.5624, + "step": 13423 + }, + { + "epoch": 4.955451636721635, + "grad_norm": 0.9713200330734253, + "learning_rate": 3.6791688741372484e-07, + "loss": 0.5696, + "step": 13424 + }, + { + "epoch": 4.9558208220526705, + "grad_norm": 0.974109411239624, + "learning_rate": 3.676629508957527e-07, + "loss": 0.5868, + "step": 13425 + }, + { + "epoch": 4.956190007383706, + "grad_norm": 0.962918221950531, + "learning_rate": 3.6740909508606456e-07, + "loss": 0.5606, + "step": 13426 + }, + { + "epoch": 4.956559192714742, + "grad_norm": 0.9531246423721313, + "learning_rate": 3.671553199942696e-07, + "loss": 0.5817, + "step": 13427 + }, + { + "epoch": 4.956928378045779, + "grad_norm": 0.9713483452796936, + "learning_rate": 3.669016256299721e-07, + "loss": 0.5897, + "step": 13428 + }, + { + "epoch": 4.957297563376815, + "grad_norm": 0.9848222732543945, + "learning_rate": 3.66648012002776e-07, + "loss": 0.5354, + "step": 13429 + }, + { + "epoch": 4.957666748707851, + "grad_norm": 0.9197826981544495, + "learning_rate": 3.6639447912227787e-07, + "loss": 0.5828, + "step": 13430 + }, + { + "epoch": 4.958035934038888, + "grad_norm": 0.9740970134735107, + "learning_rate": 3.6614102699807585e-07, + "loss": 0.5656, + "step": 13431 + }, + { + "epoch": 4.958405119369924, + "grad_norm": 0.9670155048370361, + "learning_rate": 3.658876556397628e-07, + "loss": 0.5493, + "step": 13432 + }, + { + "epoch": 4.95877430470096, + "grad_norm": 0.9522271156311035, + "learning_rate": 3.656343650569286e-07, + "loss": 0.5618, + "step": 13433 + }, + { + "epoch": 4.959143490031996, + "grad_norm": 0.9417773485183716, + "learning_rate": 3.653811552591599e-07, + "loss": 0.5547, + "step": 13434 + }, + { + "epoch": 4.959512675363032, + "grad_norm": 0.9520286917686462, + "learning_rate": 3.6512802625604025e-07, + "loss": 0.5895, + "step": 13435 + }, + { + "epoch": 4.959881860694068, + "grad_norm": 0.9723545908927917, + "learning_rate": 3.6487497805715086e-07, + "loss": 0.6124, + "step": 13436 + }, + { + "epoch": 4.960251046025105, + "grad_norm": 0.9622791409492493, + "learning_rate": 3.64622010672071e-07, + "loss": 0.5914, + "step": 13437 + }, + { + "epoch": 4.960620231356141, + "grad_norm": 0.9674283862113953, + "learning_rate": 3.643691241103731e-07, + "loss": 0.5714, + "step": 13438 + }, + { + "epoch": 4.960989416687177, + "grad_norm": 0.936915934085846, + "learning_rate": 3.6411631838163016e-07, + "loss": 0.5932, + "step": 13439 + }, + { + "epoch": 4.961358602018213, + "grad_norm": 0.9520227313041687, + "learning_rate": 3.638635934954102e-07, + "loss": 0.5685, + "step": 13440 + }, + { + "epoch": 4.96172778734925, + "grad_norm": 0.9389573335647583, + "learning_rate": 3.636109494612794e-07, + "loss": 0.5509, + "step": 13441 + }, + { + "epoch": 4.9620969726802855, + "grad_norm": 0.9380732774734497, + "learning_rate": 3.6335838628879987e-07, + "loss": 0.5427, + "step": 13442 + }, + { + "epoch": 4.962466158011321, + "grad_norm": 0.9538351893424988, + "learning_rate": 3.6310590398753066e-07, + "loss": 0.611, + "step": 13443 + }, + { + "epoch": 4.962835343342358, + "grad_norm": 0.9604706764221191, + "learning_rate": 3.6285350256702905e-07, + "loss": 0.5441, + "step": 13444 + }, + { + "epoch": 4.963204528673394, + "grad_norm": 0.9596091508865356, + "learning_rate": 3.6260118203684803e-07, + "loss": 0.5721, + "step": 13445 + }, + { + "epoch": 4.96357371400443, + "grad_norm": 0.9395207762718201, + "learning_rate": 3.6234894240653684e-07, + "loss": 0.5891, + "step": 13446 + }, + { + "epoch": 4.963942899335466, + "grad_norm": 1.2204569578170776, + "learning_rate": 3.620967836856443e-07, + "loss": 0.5239, + "step": 13447 + }, + { + "epoch": 4.964312084666503, + "grad_norm": 0.9353388547897339, + "learning_rate": 3.618447058837132e-07, + "loss": 0.5465, + "step": 13448 + }, + { + "epoch": 4.964681269997539, + "grad_norm": 0.9452176094055176, + "learning_rate": 3.615927090102861e-07, + "loss": 0.5642, + "step": 13449 + }, + { + "epoch": 4.965050455328575, + "grad_norm": 0.9776899218559265, + "learning_rate": 3.613407930748991e-07, + "loss": 0.5551, + "step": 13450 + }, + { + "epoch": 4.965419640659611, + "grad_norm": 0.9816679954528809, + "learning_rate": 3.6108895808708827e-07, + "loss": 0.5579, + "step": 13451 + }, + { + "epoch": 4.965788825990647, + "grad_norm": 0.9425308108329773, + "learning_rate": 3.608372040563857e-07, + "loss": 0.6092, + "step": 13452 + }, + { + "epoch": 4.966158011321683, + "grad_norm": 0.9617993831634521, + "learning_rate": 3.605855309923198e-07, + "loss": 0.5085, + "step": 13453 + }, + { + "epoch": 4.96652719665272, + "grad_norm": 0.9630081057548523, + "learning_rate": 3.603339389044158e-07, + "loss": 0.581, + "step": 13454 + }, + { + "epoch": 4.966896381983756, + "grad_norm": 0.9692290425300598, + "learning_rate": 3.600824278021975e-07, + "loss": 0.5911, + "step": 13455 + }, + { + "epoch": 4.967265567314792, + "grad_norm": 0.938564658164978, + "learning_rate": 3.598309976951836e-07, + "loss": 0.5951, + "step": 13456 + }, + { + "epoch": 4.967634752645829, + "grad_norm": 0.9855772256851196, + "learning_rate": 3.595796485928918e-07, + "loss": 0.6066, + "step": 13457 + }, + { + "epoch": 4.968003937976865, + "grad_norm": 0.9678559303283691, + "learning_rate": 3.593283805048337e-07, + "loss": 0.5719, + "step": 13458 + }, + { + "epoch": 4.9683731233079005, + "grad_norm": 0.9671450257301331, + "learning_rate": 3.59077193440521e-07, + "loss": 0.5712, + "step": 13459 + }, + { + "epoch": 4.968742308638936, + "grad_norm": 0.9612677097320557, + "learning_rate": 3.588260874094612e-07, + "loss": 0.5695, + "step": 13460 + }, + { + "epoch": 4.969111493969973, + "grad_norm": 0.9616091251373291, + "learning_rate": 3.5857506242115813e-07, + "loss": 0.562, + "step": 13461 + }, + { + "epoch": 4.969480679301009, + "grad_norm": 0.9795394539833069, + "learning_rate": 3.583241184851133e-07, + "loss": 0.5814, + "step": 13462 + }, + { + "epoch": 4.969849864632045, + "grad_norm": 0.9904385209083557, + "learning_rate": 3.580732556108241e-07, + "loss": 0.5861, + "step": 13463 + }, + { + "epoch": 4.970219049963082, + "grad_norm": 0.968606173992157, + "learning_rate": 3.5782247380778677e-07, + "loss": 0.5194, + "step": 13464 + }, + { + "epoch": 4.970588235294118, + "grad_norm": 0.9541077613830566, + "learning_rate": 3.5757177308549275e-07, + "loss": 0.5331, + "step": 13465 + }, + { + "epoch": 4.970957420625154, + "grad_norm": 0.9943683743476868, + "learning_rate": 3.5732115345343016e-07, + "loss": 0.5542, + "step": 13466 + }, + { + "epoch": 4.97132660595619, + "grad_norm": 0.9496219754219055, + "learning_rate": 3.5707061492108645e-07, + "loss": 0.5284, + "step": 13467 + }, + { + "epoch": 4.971695791287226, + "grad_norm": 0.9609407782554626, + "learning_rate": 3.5682015749794296e-07, + "loss": 0.5809, + "step": 13468 + }, + { + "epoch": 4.972064976618262, + "grad_norm": 0.9565087556838989, + "learning_rate": 3.5656978119348066e-07, + "loss": 0.5862, + "step": 13469 + }, + { + "epoch": 4.972434161949298, + "grad_norm": 0.9692551493644714, + "learning_rate": 3.5631948601717565e-07, + "loss": 0.5443, + "step": 13470 + }, + { + "epoch": 4.972803347280335, + "grad_norm": 0.966866135597229, + "learning_rate": 3.5606927197850113e-07, + "loss": 0.575, + "step": 13471 + }, + { + "epoch": 4.973172532611371, + "grad_norm": 0.9608084559440613, + "learning_rate": 3.5581913908692836e-07, + "loss": 0.5689, + "step": 13472 + }, + { + "epoch": 4.973541717942407, + "grad_norm": 0.9855614304542542, + "learning_rate": 3.5556908735192464e-07, + "loss": 0.5716, + "step": 13473 + }, + { + "epoch": 4.973910903273444, + "grad_norm": 0.9852381348609924, + "learning_rate": 3.553191167829537e-07, + "loss": 0.5738, + "step": 13474 + }, + { + "epoch": 4.97428008860448, + "grad_norm": 0.9442415237426758, + "learning_rate": 3.550692273894776e-07, + "loss": 0.544, + "step": 13475 + }, + { + "epoch": 4.9746492739355155, + "grad_norm": 0.964142382144928, + "learning_rate": 3.54819419180954e-07, + "loss": 0.5864, + "step": 13476 + }, + { + "epoch": 4.975018459266552, + "grad_norm": 0.972114622592926, + "learning_rate": 3.545696921668393e-07, + "loss": 0.5529, + "step": 13477 + }, + { + "epoch": 4.975387644597588, + "grad_norm": 0.9292877316474915, + "learning_rate": 3.543200463565838e-07, + "loss": 0.5956, + "step": 13478 + }, + { + "epoch": 4.975756829928624, + "grad_norm": 0.9777455925941467, + "learning_rate": 3.5407048175963714e-07, + "loss": 0.5834, + "step": 13479 + }, + { + "epoch": 4.97612601525966, + "grad_norm": 0.9338268637657166, + "learning_rate": 3.53820998385446e-07, + "loss": 0.5936, + "step": 13480 + }, + { + "epoch": 4.976495200590697, + "grad_norm": 0.9782612323760986, + "learning_rate": 3.535715962434527e-07, + "loss": 0.5538, + "step": 13481 + }, + { + "epoch": 4.976864385921733, + "grad_norm": 0.9603361487388611, + "learning_rate": 3.533222753430973e-07, + "loss": 0.5539, + "step": 13482 + }, + { + "epoch": 4.977233571252769, + "grad_norm": 0.9680009484291077, + "learning_rate": 3.5307303569381556e-07, + "loss": 0.5662, + "step": 13483 + }, + { + "epoch": 4.977602756583805, + "grad_norm": 0.9875103235244751, + "learning_rate": 3.528238773050421e-07, + "loss": 0.6052, + "step": 13484 + }, + { + "epoch": 4.977971941914841, + "grad_norm": 0.9866600036621094, + "learning_rate": 3.5257480018620784e-07, + "loss": 0.5712, + "step": 13485 + }, + { + "epoch": 4.978341127245877, + "grad_norm": 0.9467758536338806, + "learning_rate": 3.523258043467387e-07, + "loss": 0.5693, + "step": 13486 + }, + { + "epoch": 4.978710312576913, + "grad_norm": 0.9566193222999573, + "learning_rate": 3.520768897960605e-07, + "loss": 0.5653, + "step": 13487 + }, + { + "epoch": 4.97907949790795, + "grad_norm": 0.9661201238632202, + "learning_rate": 3.5182805654359386e-07, + "loss": 0.5852, + "step": 13488 + }, + { + "epoch": 4.979448683238986, + "grad_norm": 0.948600172996521, + "learning_rate": 3.5157930459875784e-07, + "loss": 0.5412, + "step": 13489 + }, + { + "epoch": 4.979817868570022, + "grad_norm": 0.9669228792190552, + "learning_rate": 3.513306339709668e-07, + "loss": 0.5741, + "step": 13490 + }, + { + "epoch": 4.980187053901059, + "grad_norm": 0.924223780632019, + "learning_rate": 3.510820446696328e-07, + "loss": 0.5702, + "step": 13491 + }, + { + "epoch": 4.980556239232095, + "grad_norm": 0.9888768792152405, + "learning_rate": 3.508335367041657e-07, + "loss": 0.5651, + "step": 13492 + }, + { + "epoch": 4.9809254245631305, + "grad_norm": 0.9681922793388367, + "learning_rate": 3.5058511008397105e-07, + "loss": 0.5861, + "step": 13493 + }, + { + "epoch": 4.981294609894167, + "grad_norm": 0.9481013417243958, + "learning_rate": 3.503367648184511e-07, + "loss": 0.575, + "step": 13494 + }, + { + "epoch": 4.981663795225203, + "grad_norm": 0.9574298858642578, + "learning_rate": 3.500885009170066e-07, + "loss": 0.5936, + "step": 13495 + }, + { + "epoch": 4.982032980556239, + "grad_norm": 0.9820094108581543, + "learning_rate": 3.498403183890331e-07, + "loss": 0.5859, + "step": 13496 + }, + { + "epoch": 4.982402165887275, + "grad_norm": 0.9701007604598999, + "learning_rate": 3.495922172439259e-07, + "loss": 0.5792, + "step": 13497 + }, + { + "epoch": 4.982771351218312, + "grad_norm": 0.9402222633361816, + "learning_rate": 3.493441974910744e-07, + "loss": 0.5648, + "step": 13498 + }, + { + "epoch": 4.983140536549348, + "grad_norm": 0.9436431527137756, + "learning_rate": 3.490962591398656e-07, + "loss": 0.5779, + "step": 13499 + }, + { + "epoch": 4.983509721880384, + "grad_norm": 0.9784184694290161, + "learning_rate": 3.488484021996852e-07, + "loss": 0.5889, + "step": 13500 + }, + { + "epoch": 4.9838789072114205, + "grad_norm": 0.9913752675056458, + "learning_rate": 3.4860062667991386e-07, + "loss": 0.5508, + "step": 13501 + }, + { + "epoch": 4.984248092542456, + "grad_norm": 0.93379145860672, + "learning_rate": 3.483529325899293e-07, + "loss": 0.5891, + "step": 13502 + }, + { + "epoch": 4.984617277873492, + "grad_norm": 0.9562308192253113, + "learning_rate": 3.481053199391077e-07, + "loss": 0.563, + "step": 13503 + }, + { + "epoch": 4.984986463204528, + "grad_norm": 0.9739440083503723, + "learning_rate": 3.478577887368198e-07, + "loss": 0.5842, + "step": 13504 + }, + { + "epoch": 4.985355648535565, + "grad_norm": 0.9617305397987366, + "learning_rate": 3.476103389924368e-07, + "loss": 0.5977, + "step": 13505 + }, + { + "epoch": 4.985724833866601, + "grad_norm": 0.9413610100746155, + "learning_rate": 3.47362970715322e-07, + "loss": 0.5756, + "step": 13506 + }, + { + "epoch": 4.986094019197637, + "grad_norm": 1.0003515481948853, + "learning_rate": 3.471156839148393e-07, + "loss": 0.5961, + "step": 13507 + }, + { + "epoch": 4.986463204528674, + "grad_norm": 0.9463071227073669, + "learning_rate": 3.46868478600349e-07, + "loss": 0.57, + "step": 13508 + }, + { + "epoch": 4.98683238985971, + "grad_norm": 0.9938381314277649, + "learning_rate": 3.4662135478120753e-07, + "loss": 0.569, + "step": 13509 + }, + { + "epoch": 4.9872015751907455, + "grad_norm": 0.9620950818061829, + "learning_rate": 3.463743124667679e-07, + "loss": 0.5631, + "step": 13510 + }, + { + "epoch": 4.987570760521782, + "grad_norm": 0.9525800347328186, + "learning_rate": 3.4612735166638016e-07, + "loss": 0.553, + "step": 13511 + }, + { + "epoch": 4.987939945852818, + "grad_norm": 0.9706422090530396, + "learning_rate": 3.4588047238939326e-07, + "loss": 0.5629, + "step": 13512 + }, + { + "epoch": 4.988309131183854, + "grad_norm": 0.9819415807723999, + "learning_rate": 3.456336746451505e-07, + "loss": 0.6075, + "step": 13513 + }, + { + "epoch": 4.988678316514891, + "grad_norm": 0.9691826701164246, + "learning_rate": 3.453869584429928e-07, + "loss": 0.5936, + "step": 13514 + }, + { + "epoch": 4.989047501845927, + "grad_norm": 0.9647552371025085, + "learning_rate": 3.45140323792259e-07, + "loss": 0.5857, + "step": 13515 + }, + { + "epoch": 4.989416687176963, + "grad_norm": 0.9924982190132141, + "learning_rate": 3.448937707022834e-07, + "loss": 0.5684, + "step": 13516 + }, + { + "epoch": 4.989785872507999, + "grad_norm": 0.9415305256843567, + "learning_rate": 3.446472991823993e-07, + "loss": 0.5757, + "step": 13517 + }, + { + "epoch": 4.9901550578390355, + "grad_norm": 0.9728958606719971, + "learning_rate": 3.4440090924193426e-07, + "loss": 0.5699, + "step": 13518 + }, + { + "epoch": 4.990524243170071, + "grad_norm": 0.9479605555534363, + "learning_rate": 3.4415460089021414e-07, + "loss": 0.5879, + "step": 13519 + }, + { + "epoch": 4.990893428501107, + "grad_norm": 0.9628505706787109, + "learning_rate": 3.4390837413656256e-07, + "loss": 0.5305, + "step": 13520 + }, + { + "epoch": 4.991262613832144, + "grad_norm": 0.949302613735199, + "learning_rate": 3.436622289902983e-07, + "loss": 0.5578, + "step": 13521 + }, + { + "epoch": 4.99163179916318, + "grad_norm": 0.9663687348365784, + "learning_rate": 3.434161654607379e-07, + "loss": 0.6303, + "step": 13522 + }, + { + "epoch": 4.992000984494216, + "grad_norm": 1.0215343236923218, + "learning_rate": 3.431701835571954e-07, + "loss": 0.5689, + "step": 13523 + }, + { + "epoch": 4.992370169825252, + "grad_norm": 0.9819056987762451, + "learning_rate": 3.429242832889801e-07, + "loss": 0.5828, + "step": 13524 + }, + { + "epoch": 4.992739355156289, + "grad_norm": 0.9601650834083557, + "learning_rate": 3.4267846466540106e-07, + "loss": 0.5603, + "step": 13525 + }, + { + "epoch": 4.9931085404873246, + "grad_norm": 0.9312421083450317, + "learning_rate": 3.4243272769576035e-07, + "loss": 0.5465, + "step": 13526 + }, + { + "epoch": 4.9934777258183605, + "grad_norm": 0.9699046611785889, + "learning_rate": 3.421870723893597e-07, + "loss": 0.5703, + "step": 13527 + }, + { + "epoch": 4.993846911149397, + "grad_norm": 0.9554951786994934, + "learning_rate": 3.419414987554981e-07, + "loss": 0.5774, + "step": 13528 + }, + { + "epoch": 4.994216096480433, + "grad_norm": 0.9631796479225159, + "learning_rate": 3.416960068034694e-07, + "loss": 0.5862, + "step": 13529 + }, + { + "epoch": 4.994585281811469, + "grad_norm": 0.9967352151870728, + "learning_rate": 3.4145059654256544e-07, + "loss": 0.5781, + "step": 13530 + }, + { + "epoch": 4.994954467142506, + "grad_norm": 0.9460363984107971, + "learning_rate": 3.412052679820757e-07, + "loss": 0.5913, + "step": 13531 + }, + { + "epoch": 4.995323652473542, + "grad_norm": 0.9789657592773438, + "learning_rate": 3.4096002113128495e-07, + "loss": 0.5997, + "step": 13532 + }, + { + "epoch": 4.995692837804578, + "grad_norm": 0.9679410457611084, + "learning_rate": 3.407148559994761e-07, + "loss": 0.5413, + "step": 13533 + }, + { + "epoch": 4.9960620231356145, + "grad_norm": 0.9603450894355774, + "learning_rate": 3.40469772595928e-07, + "loss": 0.5907, + "step": 13534 + }, + { + "epoch": 4.9964312084666505, + "grad_norm": 0.9865407347679138, + "learning_rate": 3.402247709299175e-07, + "loss": 0.5814, + "step": 13535 + }, + { + "epoch": 4.996800393797686, + "grad_norm": 0.9246069192886353, + "learning_rate": 3.399798510107183e-07, + "loss": 0.5765, + "step": 13536 + }, + { + "epoch": 4.997169579128722, + "grad_norm": 0.9747608304023743, + "learning_rate": 3.3973501284759983e-07, + "loss": 0.5704, + "step": 13537 + }, + { + "epoch": 4.997538764459759, + "grad_norm": 0.9710931777954102, + "learning_rate": 3.394902564498295e-07, + "loss": 0.5436, + "step": 13538 + }, + { + "epoch": 4.997907949790795, + "grad_norm": 0.9734852910041809, + "learning_rate": 3.3924558182667073e-07, + "loss": 0.5661, + "step": 13539 + }, + { + "epoch": 4.998277135121831, + "grad_norm": 0.9661197662353516, + "learning_rate": 3.390009889873849e-07, + "loss": 0.5598, + "step": 13540 + } + ], + "logging_steps": 1, + "max_steps": 16248, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2708, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.419916882520572e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}