diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.6633499170812603, + "epoch": 1.0, "eval_steps": 500, - "global_step": 12000, + "global_step": 18090, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -16807,6 +16807,8532 @@ "learning_rate": 1.731097214326322e-05, "loss": 4.8989, "step": 12000 + }, + { + "epoch": 0.6636263128800443, + "grad_norm": 3.782233953475952, + "learning_rate": 1.729675952245594e-05, + "loss": 4.989, + "step": 12005 + }, + { + "epoch": 0.6639027086788281, + "grad_norm": 3.869248867034912, + "learning_rate": 1.7282546901648665e-05, + "loss": 4.8152, + "step": 12010 + }, + { + "epoch": 0.664179104477612, + "grad_norm": 3.3503940105438232, + "learning_rate": 1.726833428084139e-05, + "loss": 5.0352, + "step": 12015 + }, + { + "epoch": 0.6644555002763958, + "grad_norm": 3.2065019607543945, + "learning_rate": 1.7254121660034112e-05, + "loss": 4.8756, + "step": 12020 + }, + { + "epoch": 0.6647318960751797, + "grad_norm": 3.2806355953216553, + "learning_rate": 1.7239909039226833e-05, + "loss": 4.9745, + "step": 12025 + }, + { + "epoch": 0.6650082918739635, + "grad_norm": 3.366260290145874, + "learning_rate": 1.7225696418419557e-05, + "loss": 4.9347, + "step": 12030 + }, + { + "epoch": 0.6652846876727474, + "grad_norm": 3.2907090187072754, + "learning_rate": 1.721148379761228e-05, + "loss": 4.9765, + "step": 12035 + }, + { + "epoch": 0.6655610834715312, + "grad_norm": 4.094642639160156, + "learning_rate": 1.7197271176805005e-05, + "loss": 4.8334, + "step": 12040 + }, + { + "epoch": 0.6658374792703151, + "grad_norm": 3.0640034675598145, + "learning_rate": 1.718305855599773e-05, + "loss": 4.6388, + "step": 12045 + }, + { + "epoch": 0.6661138750690989, + "grad_norm": 3.662433385848999, + "learning_rate": 1.716884593519045e-05, + "loss": 4.8606, + "step": 12050 + }, + { + "epoch": 0.6663902708678828, + "grad_norm": 4.076939105987549, + "learning_rate": 1.7154633314383174e-05, + "loss": 4.8626, + "step": 12055 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 3.6793127059936523, + "learning_rate": 1.7140420693575894e-05, + "loss": 4.8432, + "step": 12060 + }, + { + "epoch": 0.6669430624654505, + "grad_norm": 3.6468772888183594, + "learning_rate": 1.7126208072768618e-05, + "loss": 4.7681, + "step": 12065 + }, + { + "epoch": 0.6672194582642343, + "grad_norm": 3.1914479732513428, + "learning_rate": 1.7111995451961342e-05, + "loss": 5.0191, + "step": 12070 + }, + { + "epoch": 0.6674958540630183, + "grad_norm": 3.367316484451294, + "learning_rate": 1.7097782831154066e-05, + "loss": 5.0413, + "step": 12075 + }, + { + "epoch": 0.6677722498618021, + "grad_norm": 3.765392780303955, + "learning_rate": 1.708357021034679e-05, + "loss": 4.8648, + "step": 12080 + }, + { + "epoch": 0.668048645660586, + "grad_norm": 3.203326463699341, + "learning_rate": 1.706935758953951e-05, + "loss": 4.9687, + "step": 12085 + }, + { + "epoch": 0.6683250414593698, + "grad_norm": 3.558098554611206, + "learning_rate": 1.7055144968732235e-05, + "loss": 5.1091, + "step": 12090 + }, + { + "epoch": 0.6686014372581537, + "grad_norm": 2.9107744693756104, + "learning_rate": 1.704093234792496e-05, + "loss": 4.8894, + "step": 12095 + }, + { + "epoch": 0.6688778330569375, + "grad_norm": 3.141874074935913, + "learning_rate": 1.7026719727117683e-05, + "loss": 4.7187, + "step": 12100 + }, + { + "epoch": 0.6691542288557214, + "grad_norm": 3.327786684036255, + "learning_rate": 1.7012507106310403e-05, + "loss": 5.34, + "step": 12105 + }, + { + "epoch": 0.6694306246545052, + "grad_norm": 3.6189236640930176, + "learning_rate": 1.6998294485503127e-05, + "loss": 5.1619, + "step": 12110 + }, + { + "epoch": 0.6697070204532891, + "grad_norm": 3.134925365447998, + "learning_rate": 1.698408186469585e-05, + "loss": 5.4205, + "step": 12115 + }, + { + "epoch": 0.6699834162520729, + "grad_norm": 3.9740278720855713, + "learning_rate": 1.6969869243888572e-05, + "loss": 4.929, + "step": 12120 + }, + { + "epoch": 0.6702598120508568, + "grad_norm": 3.459599494934082, + "learning_rate": 1.69556566230813e-05, + "loss": 5.0055, + "step": 12125 + }, + { + "epoch": 0.6705362078496406, + "grad_norm": 3.837351083755493, + "learning_rate": 1.694144400227402e-05, + "loss": 5.0351, + "step": 12130 + }, + { + "epoch": 0.6708126036484245, + "grad_norm": 4.666230201721191, + "learning_rate": 1.6927231381466744e-05, + "loss": 5.0094, + "step": 12135 + }, + { + "epoch": 0.6710889994472085, + "grad_norm": 4.403833866119385, + "learning_rate": 1.6913018760659464e-05, + "loss": 5.0496, + "step": 12140 + }, + { + "epoch": 0.6713653952459923, + "grad_norm": 3.412066698074341, + "learning_rate": 1.6898806139852188e-05, + "loss": 5.5492, + "step": 12145 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 4.21888542175293, + "learning_rate": 1.6884593519044916e-05, + "loss": 4.8999, + "step": 12150 + }, + { + "epoch": 0.67191818684356, + "grad_norm": 3.7332446575164795, + "learning_rate": 1.6870380898237636e-05, + "loss": 5.1613, + "step": 12155 + }, + { + "epoch": 0.6721945826423439, + "grad_norm": 3.9630868434906006, + "learning_rate": 1.685616827743036e-05, + "loss": 5.1332, + "step": 12160 + }, + { + "epoch": 0.6724709784411277, + "grad_norm": 3.0835654735565186, + "learning_rate": 1.684195565662308e-05, + "loss": 5.3016, + "step": 12165 + }, + { + "epoch": 0.6727473742399116, + "grad_norm": 3.015239715576172, + "learning_rate": 1.6827743035815805e-05, + "loss": 5.2528, + "step": 12170 + }, + { + "epoch": 0.6730237700386954, + "grad_norm": 3.449125051498413, + "learning_rate": 1.681353041500853e-05, + "loss": 4.8346, + "step": 12175 + }, + { + "epoch": 0.6733001658374793, + "grad_norm": 3.250673294067383, + "learning_rate": 1.6799317794201253e-05, + "loss": 5.0164, + "step": 12180 + }, + { + "epoch": 0.6735765616362631, + "grad_norm": 3.8429434299468994, + "learning_rate": 1.6785105173393977e-05, + "loss": 5.326, + "step": 12185 + }, + { + "epoch": 0.673852957435047, + "grad_norm": 3.823730707168579, + "learning_rate": 1.6770892552586697e-05, + "loss": 5.399, + "step": 12190 + }, + { + "epoch": 0.6741293532338308, + "grad_norm": 3.227236747741699, + "learning_rate": 1.675667993177942e-05, + "loss": 4.8976, + "step": 12195 + }, + { + "epoch": 0.6744057490326147, + "grad_norm": 3.5069992542266846, + "learning_rate": 1.6742467310972142e-05, + "loss": 4.9528, + "step": 12200 + }, + { + "epoch": 0.6746821448313985, + "grad_norm": 2.9861481189727783, + "learning_rate": 1.672825469016487e-05, + "loss": 5.1612, + "step": 12205 + }, + { + "epoch": 0.6749585406301825, + "grad_norm": 2.750514507293701, + "learning_rate": 1.671404206935759e-05, + "loss": 4.7478, + "step": 12210 + }, + { + "epoch": 0.6752349364289663, + "grad_norm": 3.113337993621826, + "learning_rate": 1.6699829448550314e-05, + "loss": 5.0428, + "step": 12215 + }, + { + "epoch": 0.6755113322277502, + "grad_norm": 3.744915723800659, + "learning_rate": 1.6685616827743038e-05, + "loss": 4.9286, + "step": 12220 + }, + { + "epoch": 0.675787728026534, + "grad_norm": 2.7885375022888184, + "learning_rate": 1.667140420693576e-05, + "loss": 5.1185, + "step": 12225 + }, + { + "epoch": 0.6760641238253179, + "grad_norm": 3.3463151454925537, + "learning_rate": 1.6657191586128482e-05, + "loss": 4.815, + "step": 12230 + }, + { + "epoch": 0.6763405196241017, + "grad_norm": 3.387303113937378, + "learning_rate": 1.6642978965321206e-05, + "loss": 5.0025, + "step": 12235 + }, + { + "epoch": 0.6766169154228856, + "grad_norm": 3.3395464420318604, + "learning_rate": 1.662876634451393e-05, + "loss": 5.3222, + "step": 12240 + }, + { + "epoch": 0.6768933112216694, + "grad_norm": 3.5981926918029785, + "learning_rate": 1.661455372370665e-05, + "loss": 5.262, + "step": 12245 + }, + { + "epoch": 0.6771697070204533, + "grad_norm": 2.9530062675476074, + "learning_rate": 1.6600341102899375e-05, + "loss": 5.0093, + "step": 12250 + }, + { + "epoch": 0.6774461028192371, + "grad_norm": 2.790177583694458, + "learning_rate": 1.65861284820921e-05, + "loss": 5.178, + "step": 12255 + }, + { + "epoch": 0.677722498618021, + "grad_norm": 3.2655928134918213, + "learning_rate": 1.6571915861284823e-05, + "loss": 4.9004, + "step": 12260 + }, + { + "epoch": 0.6779988944168048, + "grad_norm": 3.1761622428894043, + "learning_rate": 1.6557703240477547e-05, + "loss": 5.2096, + "step": 12265 + }, + { + "epoch": 0.6782752902155887, + "grad_norm": 3.3835296630859375, + "learning_rate": 1.6543490619670267e-05, + "loss": 4.7797, + "step": 12270 + }, + { + "epoch": 0.6785516860143725, + "grad_norm": 3.7050514221191406, + "learning_rate": 1.652927799886299e-05, + "loss": 5.075, + "step": 12275 + }, + { + "epoch": 0.6788280818131565, + "grad_norm": 3.350598096847534, + "learning_rate": 1.6515065378055712e-05, + "loss": 4.9758, + "step": 12280 + }, + { + "epoch": 0.6791044776119403, + "grad_norm": 3.3467860221862793, + "learning_rate": 1.6500852757248436e-05, + "loss": 4.984, + "step": 12285 + }, + { + "epoch": 0.6793808734107242, + "grad_norm": 3.433746814727783, + "learning_rate": 1.648664013644116e-05, + "loss": 4.8733, + "step": 12290 + }, + { + "epoch": 0.679657269209508, + "grad_norm": 3.335195302963257, + "learning_rate": 1.6472427515633884e-05, + "loss": 4.9734, + "step": 12295 + }, + { + "epoch": 0.6799336650082919, + "grad_norm": 3.4817519187927246, + "learning_rate": 1.6458214894826608e-05, + "loss": 4.9225, + "step": 12300 + }, + { + "epoch": 0.6802100608070757, + "grad_norm": 3.4715635776519775, + "learning_rate": 1.644400227401933e-05, + "loss": 5.0767, + "step": 12305 + }, + { + "epoch": 0.6804864566058596, + "grad_norm": 4.440505027770996, + "learning_rate": 1.6429789653212052e-05, + "loss": 4.6531, + "step": 12310 + }, + { + "epoch": 0.6807628524046434, + "grad_norm": 3.239976644515991, + "learning_rate": 1.6415577032404776e-05, + "loss": 5.0474, + "step": 12315 + }, + { + "epoch": 0.6810392482034273, + "grad_norm": 3.019190549850464, + "learning_rate": 1.64013644115975e-05, + "loss": 4.9771, + "step": 12320 + }, + { + "epoch": 0.6813156440022111, + "grad_norm": 3.4960367679595947, + "learning_rate": 1.6387151790790224e-05, + "loss": 5.1239, + "step": 12325 + }, + { + "epoch": 0.681592039800995, + "grad_norm": 4.1001081466674805, + "learning_rate": 1.6372939169982945e-05, + "loss": 5.0819, + "step": 12330 + }, + { + "epoch": 0.6818684355997788, + "grad_norm": 4.748738765716553, + "learning_rate": 1.635872654917567e-05, + "loss": 4.8874, + "step": 12335 + }, + { + "epoch": 0.6821448313985627, + "grad_norm": 3.8815574645996094, + "learning_rate": 1.634451392836839e-05, + "loss": 5.0887, + "step": 12340 + }, + { + "epoch": 0.6824212271973465, + "grad_norm": 3.651639938354492, + "learning_rate": 1.6330301307561117e-05, + "loss": 4.7795, + "step": 12345 + }, + { + "epoch": 0.6826976229961305, + "grad_norm": 3.9696593284606934, + "learning_rate": 1.6316088686753837e-05, + "loss": 5.243, + "step": 12350 + }, + { + "epoch": 0.6829740187949144, + "grad_norm": 3.3984673023223877, + "learning_rate": 1.630187606594656e-05, + "loss": 5.1969, + "step": 12355 + }, + { + "epoch": 0.6832504145936982, + "grad_norm": 3.081726312637329, + "learning_rate": 1.6287663445139285e-05, + "loss": 5.1571, + "step": 12360 + }, + { + "epoch": 0.6835268103924821, + "grad_norm": 3.6568610668182373, + "learning_rate": 1.6273450824332006e-05, + "loss": 5.2434, + "step": 12365 + }, + { + "epoch": 0.6838032061912659, + "grad_norm": 4.056967735290527, + "learning_rate": 1.6259238203524733e-05, + "loss": 4.993, + "step": 12370 + }, + { + "epoch": 0.6840796019900498, + "grad_norm": 3.637042284011841, + "learning_rate": 1.6245025582717454e-05, + "loss": 5.5494, + "step": 12375 + }, + { + "epoch": 0.6843559977888336, + "grad_norm": 4.664302825927734, + "learning_rate": 1.6230812961910178e-05, + "loss": 4.9485, + "step": 12380 + }, + { + "epoch": 0.6846323935876175, + "grad_norm": 3.098987340927124, + "learning_rate": 1.62166003411029e-05, + "loss": 4.8687, + "step": 12385 + }, + { + "epoch": 0.6849087893864013, + "grad_norm": 3.432152509689331, + "learning_rate": 1.6202387720295623e-05, + "loss": 4.7856, + "step": 12390 + }, + { + "epoch": 0.6851851851851852, + "grad_norm": 3.564708709716797, + "learning_rate": 1.6188175099488347e-05, + "loss": 5.0981, + "step": 12395 + }, + { + "epoch": 0.685461580983969, + "grad_norm": 3.6227316856384277, + "learning_rate": 1.617396247868107e-05, + "loss": 5.0038, + "step": 12400 + }, + { + "epoch": 0.685737976782753, + "grad_norm": 3.641265392303467, + "learning_rate": 1.6159749857873794e-05, + "loss": 4.9751, + "step": 12405 + }, + { + "epoch": 0.6860143725815367, + "grad_norm": 3.2415549755096436, + "learning_rate": 1.6145537237066515e-05, + "loss": 5.1277, + "step": 12410 + }, + { + "epoch": 0.6862907683803207, + "grad_norm": 3.59389591217041, + "learning_rate": 1.613132461625924e-05, + "loss": 5.1952, + "step": 12415 + }, + { + "epoch": 0.6865671641791045, + "grad_norm": 3.559694528579712, + "learning_rate": 1.611711199545196e-05, + "loss": 4.9343, + "step": 12420 + }, + { + "epoch": 0.6868435599778884, + "grad_norm": 3.0507447719573975, + "learning_rate": 1.6102899374644687e-05, + "loss": 5.0918, + "step": 12425 + }, + { + "epoch": 0.6871199557766722, + "grad_norm": 3.375110149383545, + "learning_rate": 1.6088686753837408e-05, + "loss": 4.78, + "step": 12430 + }, + { + "epoch": 0.6873963515754561, + "grad_norm": 4.308310031890869, + "learning_rate": 1.607447413303013e-05, + "loss": 4.9678, + "step": 12435 + }, + { + "epoch": 0.6876727473742399, + "grad_norm": 4.061061382293701, + "learning_rate": 1.6060261512222856e-05, + "loss": 4.9967, + "step": 12440 + }, + { + "epoch": 0.6879491431730238, + "grad_norm": 3.5668656826019287, + "learning_rate": 1.6046048891415576e-05, + "loss": 4.695, + "step": 12445 + }, + { + "epoch": 0.6882255389718076, + "grad_norm": 3.55816650390625, + "learning_rate": 1.60318362706083e-05, + "loss": 5.0108, + "step": 12450 + }, + { + "epoch": 0.6885019347705915, + "grad_norm": 3.516140937805176, + "learning_rate": 1.6017623649801024e-05, + "loss": 5.0607, + "step": 12455 + }, + { + "epoch": 0.6887783305693753, + "grad_norm": 3.5040078163146973, + "learning_rate": 1.6003411028993748e-05, + "loss": 5.2791, + "step": 12460 + }, + { + "epoch": 0.6890547263681592, + "grad_norm": 3.3454535007476807, + "learning_rate": 1.5989198408186472e-05, + "loss": 4.9042, + "step": 12465 + }, + { + "epoch": 0.689331122166943, + "grad_norm": 4.622253894805908, + "learning_rate": 1.5974985787379193e-05, + "loss": 4.8014, + "step": 12470 + }, + { + "epoch": 0.689607517965727, + "grad_norm": 4.814120292663574, + "learning_rate": 1.5960773166571917e-05, + "loss": 5.0986, + "step": 12475 + }, + { + "epoch": 0.6898839137645107, + "grad_norm": 3.6000027656555176, + "learning_rate": 1.594656054576464e-05, + "loss": 5.2295, + "step": 12480 + }, + { + "epoch": 0.6901603095632947, + "grad_norm": 3.4990899562835693, + "learning_rate": 1.5932347924957365e-05, + "loss": 5.2095, + "step": 12485 + }, + { + "epoch": 0.6904367053620785, + "grad_norm": 4.27670431137085, + "learning_rate": 1.5918135304150085e-05, + "loss": 5.3237, + "step": 12490 + }, + { + "epoch": 0.6907131011608624, + "grad_norm": 4.3061394691467285, + "learning_rate": 1.590392268334281e-05, + "loss": 5.0581, + "step": 12495 + }, + { + "epoch": 0.6909894969596462, + "grad_norm": 3.4813592433929443, + "learning_rate": 1.5889710062535533e-05, + "loss": 5.244, + "step": 12500 + }, + { + "epoch": 0.6912658927584301, + "grad_norm": 4.107872486114502, + "learning_rate": 1.5875497441728254e-05, + "loss": 4.9351, + "step": 12505 + }, + { + "epoch": 0.6915422885572139, + "grad_norm": 3.8972761631011963, + "learning_rate": 1.586128482092098e-05, + "loss": 4.954, + "step": 12510 + }, + { + "epoch": 0.6918186843559978, + "grad_norm": 3.9529995918273926, + "learning_rate": 1.5847072200113702e-05, + "loss": 4.553, + "step": 12515 + }, + { + "epoch": 0.6920950801547816, + "grad_norm": 3.2394180297851562, + "learning_rate": 1.5832859579306426e-05, + "loss": 4.8631, + "step": 12520 + }, + { + "epoch": 0.6923714759535655, + "grad_norm": 3.906923294067383, + "learning_rate": 1.5818646958499146e-05, + "loss": 4.655, + "step": 12525 + }, + { + "epoch": 0.6926478717523493, + "grad_norm": 3.0384624004364014, + "learning_rate": 1.580443433769187e-05, + "loss": 4.8143, + "step": 12530 + }, + { + "epoch": 0.6929242675511332, + "grad_norm": 3.3072428703308105, + "learning_rate": 1.5790221716884594e-05, + "loss": 5.1887, + "step": 12535 + }, + { + "epoch": 0.693200663349917, + "grad_norm": 3.6409823894500732, + "learning_rate": 1.5776009096077318e-05, + "loss": 5.3076, + "step": 12540 + }, + { + "epoch": 0.693477059148701, + "grad_norm": 4.234898567199707, + "learning_rate": 1.5761796475270042e-05, + "loss": 5.037, + "step": 12545 + }, + { + "epoch": 0.6937534549474847, + "grad_norm": 3.5586225986480713, + "learning_rate": 1.5747583854462763e-05, + "loss": 5.0659, + "step": 12550 + }, + { + "epoch": 0.6940298507462687, + "grad_norm": 3.656480073928833, + "learning_rate": 1.5733371233655487e-05, + "loss": 5.1345, + "step": 12555 + }, + { + "epoch": 0.6943062465450525, + "grad_norm": 3.5738766193389893, + "learning_rate": 1.5719158612848207e-05, + "loss": 4.8976, + "step": 12560 + }, + { + "epoch": 0.6945826423438364, + "grad_norm": 3.6187734603881836, + "learning_rate": 1.5704945992040935e-05, + "loss": 4.7864, + "step": 12565 + }, + { + "epoch": 0.6948590381426202, + "grad_norm": 3.634728193283081, + "learning_rate": 1.5690733371233655e-05, + "loss": 5.1336, + "step": 12570 + }, + { + "epoch": 0.6951354339414041, + "grad_norm": 3.8284621238708496, + "learning_rate": 1.567652075042638e-05, + "loss": 5.0473, + "step": 12575 + }, + { + "epoch": 0.695411829740188, + "grad_norm": 3.628915786743164, + "learning_rate": 1.5662308129619103e-05, + "loss": 4.8755, + "step": 12580 + }, + { + "epoch": 0.6956882255389718, + "grad_norm": 4.438342571258545, + "learning_rate": 1.5648095508811824e-05, + "loss": 4.7324, + "step": 12585 + }, + { + "epoch": 0.6959646213377557, + "grad_norm": 3.574575662612915, + "learning_rate": 1.563388288800455e-05, + "loss": 5.0009, + "step": 12590 + }, + { + "epoch": 0.6962410171365395, + "grad_norm": 3.5201807022094727, + "learning_rate": 1.5619670267197272e-05, + "loss": 5.0128, + "step": 12595 + }, + { + "epoch": 0.6965174129353234, + "grad_norm": 3.0795626640319824, + "learning_rate": 1.5605457646389996e-05, + "loss": 4.9838, + "step": 12600 + }, + { + "epoch": 0.6967938087341072, + "grad_norm": 3.3544068336486816, + "learning_rate": 1.5591245025582716e-05, + "loss": 5.1475, + "step": 12605 + }, + { + "epoch": 0.6970702045328911, + "grad_norm": 3.4172658920288086, + "learning_rate": 1.557703240477544e-05, + "loss": 4.7901, + "step": 12610 + }, + { + "epoch": 0.697346600331675, + "grad_norm": 3.966212511062622, + "learning_rate": 1.5562819783968164e-05, + "loss": 5.0707, + "step": 12615 + }, + { + "epoch": 0.6976229961304589, + "grad_norm": 3.324779748916626, + "learning_rate": 1.554860716316089e-05, + "loss": 5.0197, + "step": 12620 + }, + { + "epoch": 0.6978993919292427, + "grad_norm": 3.247331142425537, + "learning_rate": 1.5534394542353612e-05, + "loss": 5.3125, + "step": 12625 + }, + { + "epoch": 0.6981757877280266, + "grad_norm": 2.768170118331909, + "learning_rate": 1.5520181921546333e-05, + "loss": 4.8045, + "step": 12630 + }, + { + "epoch": 0.6984521835268104, + "grad_norm": 2.8270652294158936, + "learning_rate": 1.5505969300739057e-05, + "loss": 5.0571, + "step": 12635 + }, + { + "epoch": 0.6987285793255943, + "grad_norm": 3.471879005432129, + "learning_rate": 1.549175667993178e-05, + "loss": 4.9667, + "step": 12640 + }, + { + "epoch": 0.6990049751243781, + "grad_norm": 3.826054811477661, + "learning_rate": 1.5477544059124505e-05, + "loss": 4.9908, + "step": 12645 + }, + { + "epoch": 0.699281370923162, + "grad_norm": 3.6149983406066895, + "learning_rate": 1.546333143831723e-05, + "loss": 4.8681, + "step": 12650 + }, + { + "epoch": 0.6995577667219458, + "grad_norm": 4.132063388824463, + "learning_rate": 1.544911881750995e-05, + "loss": 5.0986, + "step": 12655 + }, + { + "epoch": 0.6998341625207297, + "grad_norm": 3.410290002822876, + "learning_rate": 1.5434906196702673e-05, + "loss": 5.0152, + "step": 12660 + }, + { + "epoch": 0.7001105583195135, + "grad_norm": 3.215104103088379, + "learning_rate": 1.5420693575895394e-05, + "loss": 4.7577, + "step": 12665 + }, + { + "epoch": 0.7003869541182974, + "grad_norm": 3.521066427230835, + "learning_rate": 1.5406480955088118e-05, + "loss": 4.9928, + "step": 12670 + }, + { + "epoch": 0.7006633499170812, + "grad_norm": 3.641317367553711, + "learning_rate": 1.5392268334280842e-05, + "loss": 5.1642, + "step": 12675 + }, + { + "epoch": 0.7009397457158651, + "grad_norm": 3.365527629852295, + "learning_rate": 1.5378055713473566e-05, + "loss": 4.9449, + "step": 12680 + }, + { + "epoch": 0.701216141514649, + "grad_norm": 3.430457592010498, + "learning_rate": 1.536384309266629e-05, + "loss": 5.123, + "step": 12685 + }, + { + "epoch": 0.7014925373134329, + "grad_norm": 3.236186981201172, + "learning_rate": 1.534963047185901e-05, + "loss": 5.1541, + "step": 12690 + }, + { + "epoch": 0.7017689331122167, + "grad_norm": 3.792572021484375, + "learning_rate": 1.5335417851051734e-05, + "loss": 5.1397, + "step": 12695 + }, + { + "epoch": 0.7020453289110006, + "grad_norm": 3.7908694744110107, + "learning_rate": 1.532120523024446e-05, + "loss": 5.0269, + "step": 12700 + }, + { + "epoch": 0.7023217247097844, + "grad_norm": 3.2599024772644043, + "learning_rate": 1.5306992609437182e-05, + "loss": 4.7786, + "step": 12705 + }, + { + "epoch": 0.7025981205085683, + "grad_norm": 3.5617377758026123, + "learning_rate": 1.5292779988629903e-05, + "loss": 5.0488, + "step": 12710 + }, + { + "epoch": 0.7028745163073521, + "grad_norm": 3.4966659545898438, + "learning_rate": 1.5278567367822627e-05, + "loss": 5.0739, + "step": 12715 + }, + { + "epoch": 0.703150912106136, + "grad_norm": 3.6931207180023193, + "learning_rate": 1.526435474701535e-05, + "loss": 4.7299, + "step": 12720 + }, + { + "epoch": 0.7034273079049198, + "grad_norm": 3.201852560043335, + "learning_rate": 1.5250142126208072e-05, + "loss": 5.1792, + "step": 12725 + }, + { + "epoch": 0.7037037037037037, + "grad_norm": 3.669928550720215, + "learning_rate": 1.5235929505400797e-05, + "loss": 4.8751, + "step": 12730 + }, + { + "epoch": 0.7039800995024875, + "grad_norm": 3.639188766479492, + "learning_rate": 1.5221716884593521e-05, + "loss": 4.7316, + "step": 12735 + }, + { + "epoch": 0.7042564953012714, + "grad_norm": 3.298006057739258, + "learning_rate": 1.5207504263786243e-05, + "loss": 4.7265, + "step": 12740 + }, + { + "epoch": 0.7045328911000552, + "grad_norm": 4.186183452606201, + "learning_rate": 1.5193291642978966e-05, + "loss": 4.6542, + "step": 12745 + }, + { + "epoch": 0.7048092868988391, + "grad_norm": 3.1352524757385254, + "learning_rate": 1.5179079022171688e-05, + "loss": 5.1087, + "step": 12750 + }, + { + "epoch": 0.705085682697623, + "grad_norm": 3.946307420730591, + "learning_rate": 1.516486640136441e-05, + "loss": 4.9225, + "step": 12755 + }, + { + "epoch": 0.7053620784964069, + "grad_norm": 3.4466960430145264, + "learning_rate": 1.5150653780557136e-05, + "loss": 5.0924, + "step": 12760 + }, + { + "epoch": 0.7056384742951907, + "grad_norm": 3.4541025161743164, + "learning_rate": 1.5136441159749858e-05, + "loss": 4.7316, + "step": 12765 + }, + { + "epoch": 0.7059148700939746, + "grad_norm": 3.1319077014923096, + "learning_rate": 1.5122228538942582e-05, + "loss": 5.0303, + "step": 12770 + }, + { + "epoch": 0.7061912658927584, + "grad_norm": 3.6074001789093018, + "learning_rate": 1.5108015918135305e-05, + "loss": 4.9892, + "step": 12775 + }, + { + "epoch": 0.7064676616915423, + "grad_norm": 3.2462453842163086, + "learning_rate": 1.5093803297328027e-05, + "loss": 4.9228, + "step": 12780 + }, + { + "epoch": 0.7067440574903261, + "grad_norm": 4.183916091918945, + "learning_rate": 1.5079590676520753e-05, + "loss": 5.0513, + "step": 12785 + }, + { + "epoch": 0.70702045328911, + "grad_norm": 4.227228164672852, + "learning_rate": 1.5065378055713475e-05, + "loss": 5.2663, + "step": 12790 + }, + { + "epoch": 0.7072968490878938, + "grad_norm": 4.704550743103027, + "learning_rate": 1.5051165434906197e-05, + "loss": 5.0818, + "step": 12795 + }, + { + "epoch": 0.7075732448866777, + "grad_norm": 3.2340505123138428, + "learning_rate": 1.503695281409892e-05, + "loss": 5.1363, + "step": 12800 + }, + { + "epoch": 0.7078496406854616, + "grad_norm": 4.0538153648376465, + "learning_rate": 1.5022740193291643e-05, + "loss": 5.6689, + "step": 12805 + }, + { + "epoch": 0.7081260364842454, + "grad_norm": 4.062292098999023, + "learning_rate": 1.5008527572484366e-05, + "loss": 5.157, + "step": 12810 + }, + { + "epoch": 0.7084024322830293, + "grad_norm": 3.2612810134887695, + "learning_rate": 1.4994314951677091e-05, + "loss": 4.9579, + "step": 12815 + }, + { + "epoch": 0.7086788280818132, + "grad_norm": 3.235447406768799, + "learning_rate": 1.4980102330869814e-05, + "loss": 5.1034, + "step": 12820 + }, + { + "epoch": 0.7089552238805971, + "grad_norm": 3.9662389755249023, + "learning_rate": 1.4965889710062536e-05, + "loss": 4.9475, + "step": 12825 + }, + { + "epoch": 0.7092316196793809, + "grad_norm": 3.0024068355560303, + "learning_rate": 1.4951677089255258e-05, + "loss": 4.9537, + "step": 12830 + }, + { + "epoch": 0.7095080154781648, + "grad_norm": 3.502521514892578, + "learning_rate": 1.4937464468447982e-05, + "loss": 4.9855, + "step": 12835 + }, + { + "epoch": 0.7097844112769486, + "grad_norm": 3.6317951679229736, + "learning_rate": 1.4923251847640706e-05, + "loss": 5.1954, + "step": 12840 + }, + { + "epoch": 0.7100608070757325, + "grad_norm": 3.3001856803894043, + "learning_rate": 1.490903922683343e-05, + "loss": 4.9513, + "step": 12845 + }, + { + "epoch": 0.7103372028745163, + "grad_norm": 3.484205961227417, + "learning_rate": 1.4894826606026152e-05, + "loss": 4.6211, + "step": 12850 + }, + { + "epoch": 0.7106135986733002, + "grad_norm": 3.8079354763031006, + "learning_rate": 1.4880613985218875e-05, + "loss": 5.0842, + "step": 12855 + }, + { + "epoch": 0.710889994472084, + "grad_norm": 3.5550713539123535, + "learning_rate": 1.4866401364411597e-05, + "loss": 4.981, + "step": 12860 + }, + { + "epoch": 0.7111663902708679, + "grad_norm": 4.202615261077881, + "learning_rate": 1.485218874360432e-05, + "loss": 5.1481, + "step": 12865 + }, + { + "epoch": 0.7114427860696517, + "grad_norm": 3.486539125442505, + "learning_rate": 1.4837976122797045e-05, + "loss": 5.0235, + "step": 12870 + }, + { + "epoch": 0.7117191818684356, + "grad_norm": 3.639721632003784, + "learning_rate": 1.4823763501989767e-05, + "loss": 4.9426, + "step": 12875 + }, + { + "epoch": 0.7119955776672194, + "grad_norm": 3.9784390926361084, + "learning_rate": 1.4809550881182491e-05, + "loss": 5.0809, + "step": 12880 + }, + { + "epoch": 0.7122719734660033, + "grad_norm": 2.89168643951416, + "learning_rate": 1.4795338260375213e-05, + "loss": 5.0391, + "step": 12885 + }, + { + "epoch": 0.7125483692647872, + "grad_norm": 4.273104667663574, + "learning_rate": 1.4781125639567936e-05, + "loss": 4.8018, + "step": 12890 + }, + { + "epoch": 0.7128247650635711, + "grad_norm": 3.9629507064819336, + "learning_rate": 1.4766913018760661e-05, + "loss": 5.0206, + "step": 12895 + }, + { + "epoch": 0.7131011608623549, + "grad_norm": 3.7056665420532227, + "learning_rate": 1.4752700397953384e-05, + "loss": 5.1737, + "step": 12900 + }, + { + "epoch": 0.7133775566611388, + "grad_norm": 3.9631214141845703, + "learning_rate": 1.4738487777146106e-05, + "loss": 5.4375, + "step": 12905 + }, + { + "epoch": 0.7136539524599226, + "grad_norm": 4.301235198974609, + "learning_rate": 1.472427515633883e-05, + "loss": 5.0811, + "step": 12910 + }, + { + "epoch": 0.7139303482587065, + "grad_norm": 3.4924840927124023, + "learning_rate": 1.4710062535531552e-05, + "loss": 5.0251, + "step": 12915 + }, + { + "epoch": 0.7142067440574903, + "grad_norm": 3.845777750015259, + "learning_rate": 1.4695849914724275e-05, + "loss": 4.9661, + "step": 12920 + }, + { + "epoch": 0.7144831398562742, + "grad_norm": 3.8963284492492676, + "learning_rate": 1.4681637293917e-05, + "loss": 5.0677, + "step": 12925 + }, + { + "epoch": 0.714759535655058, + "grad_norm": 3.3300371170043945, + "learning_rate": 1.4667424673109723e-05, + "loss": 4.565, + "step": 12930 + }, + { + "epoch": 0.7150359314538419, + "grad_norm": 3.4866998195648193, + "learning_rate": 1.4653212052302445e-05, + "loss": 4.9402, + "step": 12935 + }, + { + "epoch": 0.7153123272526257, + "grad_norm": 3.7221293449401855, + "learning_rate": 1.4638999431495167e-05, + "loss": 5.0538, + "step": 12940 + }, + { + "epoch": 0.7155887230514096, + "grad_norm": 3.6549923419952393, + "learning_rate": 1.4624786810687891e-05, + "loss": 5.0269, + "step": 12945 + }, + { + "epoch": 0.7158651188501934, + "grad_norm": 3.261334180831909, + "learning_rate": 1.4610574189880615e-05, + "loss": 5.0511, + "step": 12950 + }, + { + "epoch": 0.7161415146489774, + "grad_norm": 3.2248923778533936, + "learning_rate": 1.4596361569073339e-05, + "loss": 5.0044, + "step": 12955 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 3.5878825187683105, + "learning_rate": 1.4582148948266061e-05, + "loss": 5.0746, + "step": 12960 + }, + { + "epoch": 0.7166943062465451, + "grad_norm": 3.1510488986968994, + "learning_rate": 1.4567936327458784e-05, + "loss": 4.775, + "step": 12965 + }, + { + "epoch": 0.7169707020453289, + "grad_norm": 3.1844141483306885, + "learning_rate": 1.4553723706651506e-05, + "loss": 4.803, + "step": 12970 + }, + { + "epoch": 0.7172470978441128, + "grad_norm": 3.417433977127075, + "learning_rate": 1.453951108584423e-05, + "loss": 4.932, + "step": 12975 + }, + { + "epoch": 0.7175234936428966, + "grad_norm": 3.5905191898345947, + "learning_rate": 1.4525298465036954e-05, + "loss": 5.1135, + "step": 12980 + }, + { + "epoch": 0.7177998894416805, + "grad_norm": 3.180964231491089, + "learning_rate": 1.4511085844229678e-05, + "loss": 4.8405, + "step": 12985 + }, + { + "epoch": 0.7180762852404643, + "grad_norm": 3.3651866912841797, + "learning_rate": 1.44968732234224e-05, + "loss": 4.8142, + "step": 12990 + }, + { + "epoch": 0.7183526810392482, + "grad_norm": 3.8147387504577637, + "learning_rate": 1.4482660602615122e-05, + "loss": 5.152, + "step": 12995 + }, + { + "epoch": 0.718629076838032, + "grad_norm": 3.135228157043457, + "learning_rate": 1.4468447981807845e-05, + "loss": 4.7615, + "step": 13000 + }, + { + "epoch": 0.7189054726368159, + "grad_norm": 3.4666500091552734, + "learning_rate": 1.445423536100057e-05, + "loss": 5.238, + "step": 13005 + }, + { + "epoch": 0.7191818684355997, + "grad_norm": 3.4540812969207764, + "learning_rate": 1.4440022740193293e-05, + "loss": 5.1853, + "step": 13010 + }, + { + "epoch": 0.7194582642343836, + "grad_norm": 3.278425455093384, + "learning_rate": 1.4425810119386015e-05, + "loss": 5.0469, + "step": 13015 + }, + { + "epoch": 0.7197346600331676, + "grad_norm": 4.068847179412842, + "learning_rate": 1.4411597498578739e-05, + "loss": 4.8051, + "step": 13020 + }, + { + "epoch": 0.7200110558319514, + "grad_norm": 4.303192615509033, + "learning_rate": 1.4397384877771461e-05, + "loss": 4.8124, + "step": 13025 + }, + { + "epoch": 0.7202874516307353, + "grad_norm": 3.8566951751708984, + "learning_rate": 1.4383172256964183e-05, + "loss": 4.9964, + "step": 13030 + }, + { + "epoch": 0.7205638474295191, + "grad_norm": 3.5754945278167725, + "learning_rate": 1.4368959636156909e-05, + "loss": 5.055, + "step": 13035 + }, + { + "epoch": 0.720840243228303, + "grad_norm": 4.132562637329102, + "learning_rate": 1.4354747015349631e-05, + "loss": 5.0778, + "step": 13040 + }, + { + "epoch": 0.7211166390270868, + "grad_norm": 3.1052451133728027, + "learning_rate": 1.4340534394542354e-05, + "loss": 4.8714, + "step": 13045 + }, + { + "epoch": 0.7213930348258707, + "grad_norm": 3.893859386444092, + "learning_rate": 1.4326321773735078e-05, + "loss": 5.101, + "step": 13050 + }, + { + "epoch": 0.7216694306246545, + "grad_norm": 3.588839054107666, + "learning_rate": 1.43121091529278e-05, + "loss": 4.6311, + "step": 13055 + }, + { + "epoch": 0.7219458264234384, + "grad_norm": 3.061183214187622, + "learning_rate": 1.4297896532120526e-05, + "loss": 4.9162, + "step": 13060 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 3.5575475692749023, + "learning_rate": 1.4283683911313248e-05, + "loss": 5.3776, + "step": 13065 + }, + { + "epoch": 0.7224986180210061, + "grad_norm": 3.373143196105957, + "learning_rate": 1.426947129050597e-05, + "loss": 4.9563, + "step": 13070 + }, + { + "epoch": 0.7227750138197899, + "grad_norm": 4.447892189025879, + "learning_rate": 1.4255258669698693e-05, + "loss": 4.8872, + "step": 13075 + }, + { + "epoch": 0.7230514096185738, + "grad_norm": 4.527839660644531, + "learning_rate": 1.4241046048891415e-05, + "loss": 4.99, + "step": 13080 + }, + { + "epoch": 0.7233278054173576, + "grad_norm": 3.5849502086639404, + "learning_rate": 1.4226833428084139e-05, + "loss": 4.6041, + "step": 13085 + }, + { + "epoch": 0.7236042012161416, + "grad_norm": 3.7297580242156982, + "learning_rate": 1.4212620807276863e-05, + "loss": 4.948, + "step": 13090 + }, + { + "epoch": 0.7238805970149254, + "grad_norm": 3.810117721557617, + "learning_rate": 1.4198408186469587e-05, + "loss": 4.5659, + "step": 13095 + }, + { + "epoch": 0.7241569928137093, + "grad_norm": 3.6611037254333496, + "learning_rate": 1.4184195565662309e-05, + "loss": 4.7651, + "step": 13100 + }, + { + "epoch": 0.7244333886124931, + "grad_norm": 3.81868314743042, + "learning_rate": 1.4169982944855031e-05, + "loss": 4.8415, + "step": 13105 + }, + { + "epoch": 0.724709784411277, + "grad_norm": 3.0699076652526855, + "learning_rate": 1.4155770324047754e-05, + "loss": 4.8364, + "step": 13110 + }, + { + "epoch": 0.7249861802100608, + "grad_norm": 3.566793918609619, + "learning_rate": 1.414155770324048e-05, + "loss": 4.7472, + "step": 13115 + }, + { + "epoch": 0.7252625760088447, + "grad_norm": 3.110363721847534, + "learning_rate": 1.4127345082433202e-05, + "loss": 4.6858, + "step": 13120 + }, + { + "epoch": 0.7255389718076285, + "grad_norm": 3.3906173706054688, + "learning_rate": 1.4113132461625926e-05, + "loss": 4.9415, + "step": 13125 + }, + { + "epoch": 0.7258153676064124, + "grad_norm": 3.598484754562378, + "learning_rate": 1.4098919840818648e-05, + "loss": 4.9086, + "step": 13130 + }, + { + "epoch": 0.7260917634051962, + "grad_norm": 3.2210891246795654, + "learning_rate": 1.408470722001137e-05, + "loss": 5.1958, + "step": 13135 + }, + { + "epoch": 0.7263681592039801, + "grad_norm": 3.400056838989258, + "learning_rate": 1.4070494599204092e-05, + "loss": 5.2228, + "step": 13140 + }, + { + "epoch": 0.7266445550027639, + "grad_norm": 3.5063843727111816, + "learning_rate": 1.4056281978396818e-05, + "loss": 4.9669, + "step": 13145 + }, + { + "epoch": 0.7269209508015478, + "grad_norm": 3.8449692726135254, + "learning_rate": 1.404206935758954e-05, + "loss": 4.8901, + "step": 13150 + }, + { + "epoch": 0.7271973466003316, + "grad_norm": 3.1058237552642822, + "learning_rate": 1.4027856736782263e-05, + "loss": 5.0105, + "step": 13155 + }, + { + "epoch": 0.7274737423991156, + "grad_norm": 3.9088826179504395, + "learning_rate": 1.4013644115974987e-05, + "loss": 5.0905, + "step": 13160 + }, + { + "epoch": 0.7277501381978994, + "grad_norm": 3.3987643718719482, + "learning_rate": 1.3999431495167709e-05, + "loss": 4.5506, + "step": 13165 + }, + { + "epoch": 0.7280265339966833, + "grad_norm": 3.4407613277435303, + "learning_rate": 1.3985218874360435e-05, + "loss": 4.8973, + "step": 13170 + }, + { + "epoch": 0.7283029297954671, + "grad_norm": 3.590932607650757, + "learning_rate": 1.3971006253553157e-05, + "loss": 5.0722, + "step": 13175 + }, + { + "epoch": 0.728579325594251, + "grad_norm": 3.1984472274780273, + "learning_rate": 1.3956793632745879e-05, + "loss": 4.9033, + "step": 13180 + }, + { + "epoch": 0.7288557213930348, + "grad_norm": 3.5542407035827637, + "learning_rate": 1.3942581011938601e-05, + "loss": 4.8929, + "step": 13185 + }, + { + "epoch": 0.7291321171918187, + "grad_norm": 3.841609001159668, + "learning_rate": 1.3928368391131324e-05, + "loss": 5.0927, + "step": 13190 + }, + { + "epoch": 0.7294085129906025, + "grad_norm": 3.5699667930603027, + "learning_rate": 1.3914155770324048e-05, + "loss": 5.0084, + "step": 13195 + }, + { + "epoch": 0.7296849087893864, + "grad_norm": 3.561516046524048, + "learning_rate": 1.3899943149516773e-05, + "loss": 5.2851, + "step": 13200 + }, + { + "epoch": 0.7299613045881702, + "grad_norm": 3.833688497543335, + "learning_rate": 1.3885730528709496e-05, + "loss": 5.1715, + "step": 13205 + }, + { + "epoch": 0.7302377003869541, + "grad_norm": 3.255157232284546, + "learning_rate": 1.3871517907902218e-05, + "loss": 4.768, + "step": 13210 + }, + { + "epoch": 0.7305140961857379, + "grad_norm": 3.3706374168395996, + "learning_rate": 1.385730528709494e-05, + "loss": 4.8445, + "step": 13215 + }, + { + "epoch": 0.7307904919845218, + "grad_norm": 3.0012950897216797, + "learning_rate": 1.3843092666287662e-05, + "loss": 4.8293, + "step": 13220 + }, + { + "epoch": 0.7310668877833056, + "grad_norm": 3.7579739093780518, + "learning_rate": 1.3828880045480388e-05, + "loss": 5.1837, + "step": 13225 + }, + { + "epoch": 0.7313432835820896, + "grad_norm": 3.6330161094665527, + "learning_rate": 1.381466742467311e-05, + "loss": 4.9532, + "step": 13230 + }, + { + "epoch": 0.7316196793808734, + "grad_norm": 4.080511569976807, + "learning_rate": 1.3800454803865834e-05, + "loss": 5.0781, + "step": 13235 + }, + { + "epoch": 0.7318960751796573, + "grad_norm": 3.782904624938965, + "learning_rate": 1.3786242183058557e-05, + "loss": 4.9824, + "step": 13240 + }, + { + "epoch": 0.7321724709784412, + "grad_norm": 3.812208414077759, + "learning_rate": 1.3772029562251279e-05, + "loss": 5.1982, + "step": 13245 + }, + { + "epoch": 0.732448866777225, + "grad_norm": 3.541715145111084, + "learning_rate": 1.3757816941444001e-05, + "loss": 4.945, + "step": 13250 + }, + { + "epoch": 0.7327252625760089, + "grad_norm": 3.311810255050659, + "learning_rate": 1.3743604320636727e-05, + "loss": 5.0986, + "step": 13255 + }, + { + "epoch": 0.7330016583747927, + "grad_norm": 4.002370357513428, + "learning_rate": 1.372939169982945e-05, + "loss": 4.7093, + "step": 13260 + }, + { + "epoch": 0.7332780541735766, + "grad_norm": 3.475691556930542, + "learning_rate": 1.3715179079022172e-05, + "loss": 5.043, + "step": 13265 + }, + { + "epoch": 0.7335544499723604, + "grad_norm": 3.538708448410034, + "learning_rate": 1.3700966458214896e-05, + "loss": 4.9838, + "step": 13270 + }, + { + "epoch": 0.7338308457711443, + "grad_norm": 3.6737594604492188, + "learning_rate": 1.3686753837407618e-05, + "loss": 4.9001, + "step": 13275 + }, + { + "epoch": 0.7341072415699281, + "grad_norm": 3.5421972274780273, + "learning_rate": 1.3672541216600343e-05, + "loss": 4.9089, + "step": 13280 + }, + { + "epoch": 0.734383637368712, + "grad_norm": 3.507235050201416, + "learning_rate": 1.3658328595793066e-05, + "loss": 5.0143, + "step": 13285 + }, + { + "epoch": 0.7346600331674958, + "grad_norm": 2.759843587875366, + "learning_rate": 1.3644115974985788e-05, + "loss": 4.9846, + "step": 13290 + }, + { + "epoch": 0.7349364289662798, + "grad_norm": 3.151954412460327, + "learning_rate": 1.362990335417851e-05, + "loss": 4.933, + "step": 13295 + }, + { + "epoch": 0.7352128247650636, + "grad_norm": 2.9206533432006836, + "learning_rate": 1.3615690733371234e-05, + "loss": 4.7963, + "step": 13300 + }, + { + "epoch": 0.7354892205638475, + "grad_norm": 3.8691372871398926, + "learning_rate": 1.3601478112563957e-05, + "loss": 5.2145, + "step": 13305 + }, + { + "epoch": 0.7357656163626313, + "grad_norm": 3.0782530307769775, + "learning_rate": 1.3587265491756682e-05, + "loss": 4.6068, + "step": 13310 + }, + { + "epoch": 0.7360420121614152, + "grad_norm": 3.9192464351654053, + "learning_rate": 1.3573052870949405e-05, + "loss": 5.0152, + "step": 13315 + }, + { + "epoch": 0.736318407960199, + "grad_norm": 5.0663371086120605, + "learning_rate": 1.3558840250142127e-05, + "loss": 5.1813, + "step": 13320 + }, + { + "epoch": 0.7365948037589829, + "grad_norm": 3.4639241695404053, + "learning_rate": 1.3544627629334849e-05, + "loss": 5.0714, + "step": 13325 + }, + { + "epoch": 0.7368711995577667, + "grad_norm": 3.764646053314209, + "learning_rate": 1.3530415008527571e-05, + "loss": 4.9129, + "step": 13330 + }, + { + "epoch": 0.7371475953565506, + "grad_norm": 3.288600206375122, + "learning_rate": 1.3516202387720297e-05, + "loss": 4.8982, + "step": 13335 + }, + { + "epoch": 0.7374239911553344, + "grad_norm": 3.8759982585906982, + "learning_rate": 1.350198976691302e-05, + "loss": 5.1059, + "step": 13340 + }, + { + "epoch": 0.7377003869541183, + "grad_norm": 3.3489646911621094, + "learning_rate": 1.3487777146105743e-05, + "loss": 5.1768, + "step": 13345 + }, + { + "epoch": 0.7379767827529021, + "grad_norm": 3.005626916885376, + "learning_rate": 1.3473564525298466e-05, + "loss": 5.0332, + "step": 13350 + }, + { + "epoch": 0.738253178551686, + "grad_norm": 3.416574239730835, + "learning_rate": 1.3459351904491188e-05, + "loss": 5.0185, + "step": 13355 + }, + { + "epoch": 0.7385295743504698, + "grad_norm": 3.5634119510650635, + "learning_rate": 1.344513928368391e-05, + "loss": 4.7656, + "step": 13360 + }, + { + "epoch": 0.7388059701492538, + "grad_norm": 3.0939366817474365, + "learning_rate": 1.3430926662876636e-05, + "loss": 4.8285, + "step": 13365 + }, + { + "epoch": 0.7390823659480376, + "grad_norm": 3.3977231979370117, + "learning_rate": 1.3416714042069358e-05, + "loss": 4.9093, + "step": 13370 + }, + { + "epoch": 0.7393587617468215, + "grad_norm": 3.6465513706207275, + "learning_rate": 1.3402501421262082e-05, + "loss": 5.0178, + "step": 13375 + }, + { + "epoch": 0.7396351575456053, + "grad_norm": 3.207108974456787, + "learning_rate": 1.3388288800454804e-05, + "loss": 5.2148, + "step": 13380 + }, + { + "epoch": 0.7399115533443892, + "grad_norm": 3.185753107070923, + "learning_rate": 1.3374076179647527e-05, + "loss": 5.0929, + "step": 13385 + }, + { + "epoch": 0.740187949143173, + "grad_norm": 4.584200859069824, + "learning_rate": 1.3359863558840252e-05, + "loss": 4.651, + "step": 13390 + }, + { + "epoch": 0.7404643449419569, + "grad_norm": 3.067399024963379, + "learning_rate": 1.3345650938032975e-05, + "loss": 5.0875, + "step": 13395 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 3.611589193344116, + "learning_rate": 1.3331438317225697e-05, + "loss": 4.725, + "step": 13400 + }, + { + "epoch": 0.7410171365395246, + "grad_norm": 3.321446418762207, + "learning_rate": 1.331722569641842e-05, + "loss": 4.8858, + "step": 13405 + }, + { + "epoch": 0.7412935323383084, + "grad_norm": 3.519653797149658, + "learning_rate": 1.3303013075611143e-05, + "loss": 4.645, + "step": 13410 + }, + { + "epoch": 0.7415699281370923, + "grad_norm": 3.325186014175415, + "learning_rate": 1.3288800454803865e-05, + "loss": 4.8125, + "step": 13415 + }, + { + "epoch": 0.7418463239358761, + "grad_norm": 3.7790181636810303, + "learning_rate": 1.3274587833996591e-05, + "loss": 4.9224, + "step": 13420 + }, + { + "epoch": 0.74212271973466, + "grad_norm": 3.3539559841156006, + "learning_rate": 1.3260375213189313e-05, + "loss": 4.5994, + "step": 13425 + }, + { + "epoch": 0.7423991155334438, + "grad_norm": 3.002809762954712, + "learning_rate": 1.3246162592382036e-05, + "loss": 5.2474, + "step": 13430 + }, + { + "epoch": 0.7426755113322278, + "grad_norm": 3.0804951190948486, + "learning_rate": 1.3231949971574758e-05, + "loss": 4.6779, + "step": 13435 + }, + { + "epoch": 0.7429519071310116, + "grad_norm": 4.401519775390625, + "learning_rate": 1.3217737350767482e-05, + "loss": 4.8666, + "step": 13440 + }, + { + "epoch": 0.7432283029297955, + "grad_norm": 3.0811357498168945, + "learning_rate": 1.3203524729960206e-05, + "loss": 4.909, + "step": 13445 + }, + { + "epoch": 0.7435046987285793, + "grad_norm": 4.676823139190674, + "learning_rate": 1.318931210915293e-05, + "loss": 5.3051, + "step": 13450 + }, + { + "epoch": 0.7437810945273632, + "grad_norm": 3.572274684906006, + "learning_rate": 1.3175099488345652e-05, + "loss": 4.8484, + "step": 13455 + }, + { + "epoch": 0.7440574903261471, + "grad_norm": 3.933598041534424, + "learning_rate": 1.3160886867538375e-05, + "loss": 4.9078, + "step": 13460 + }, + { + "epoch": 0.7443338861249309, + "grad_norm": 4.784248352050781, + "learning_rate": 1.3146674246731097e-05, + "loss": 4.9795, + "step": 13465 + }, + { + "epoch": 0.7446102819237148, + "grad_norm": 3.54833984375, + "learning_rate": 1.3132461625923819e-05, + "loss": 5.0677, + "step": 13470 + }, + { + "epoch": 0.7448866777224986, + "grad_norm": 3.983644962310791, + "learning_rate": 1.3118249005116545e-05, + "loss": 4.8627, + "step": 13475 + }, + { + "epoch": 0.7451630735212825, + "grad_norm": 3.837332010269165, + "learning_rate": 1.3104036384309267e-05, + "loss": 4.7729, + "step": 13480 + }, + { + "epoch": 0.7454394693200663, + "grad_norm": 4.445468425750732, + "learning_rate": 1.3089823763501991e-05, + "loss": 5.0954, + "step": 13485 + }, + { + "epoch": 0.7457158651188502, + "grad_norm": 3.518786668777466, + "learning_rate": 1.3075611142694713e-05, + "loss": 4.855, + "step": 13490 + }, + { + "epoch": 0.745992260917634, + "grad_norm": 3.9900107383728027, + "learning_rate": 1.3061398521887436e-05, + "loss": 4.7721, + "step": 13495 + }, + { + "epoch": 0.746268656716418, + "grad_norm": 3.3050553798675537, + "learning_rate": 1.3047185901080161e-05, + "loss": 4.9431, + "step": 13500 + }, + { + "epoch": 0.7465450525152018, + "grad_norm": 3.213809013366699, + "learning_rate": 1.3032973280272884e-05, + "loss": 5.172, + "step": 13505 + }, + { + "epoch": 0.7468214483139857, + "grad_norm": 3.7360849380493164, + "learning_rate": 1.3018760659465606e-05, + "loss": 5.0361, + "step": 13510 + }, + { + "epoch": 0.7470978441127695, + "grad_norm": 4.024673938751221, + "learning_rate": 1.300454803865833e-05, + "loss": 4.8585, + "step": 13515 + }, + { + "epoch": 0.7473742399115534, + "grad_norm": 3.3230862617492676, + "learning_rate": 1.2990335417851052e-05, + "loss": 5.1286, + "step": 13520 + }, + { + "epoch": 0.7476506357103372, + "grad_norm": 4.014267921447754, + "learning_rate": 1.2976122797043774e-05, + "loss": 5.0204, + "step": 13525 + }, + { + "epoch": 0.7479270315091211, + "grad_norm": 3.804960250854492, + "learning_rate": 1.29619101762365e-05, + "loss": 4.6567, + "step": 13530 + }, + { + "epoch": 0.7482034273079049, + "grad_norm": 3.149284601211548, + "learning_rate": 1.2947697555429222e-05, + "loss": 4.857, + "step": 13535 + }, + { + "epoch": 0.7484798231066888, + "grad_norm": 3.5097615718841553, + "learning_rate": 1.2933484934621945e-05, + "loss": 4.5143, + "step": 13540 + }, + { + "epoch": 0.7487562189054726, + "grad_norm": 3.3128249645233154, + "learning_rate": 1.2919272313814667e-05, + "loss": 5.1963, + "step": 13545 + }, + { + "epoch": 0.7490326147042565, + "grad_norm": 3.3906478881835938, + "learning_rate": 1.2905059693007391e-05, + "loss": 4.741, + "step": 13550 + }, + { + "epoch": 0.7493090105030403, + "grad_norm": 3.369476556777954, + "learning_rate": 1.2890847072200115e-05, + "loss": 5.0067, + "step": 13555 + }, + { + "epoch": 0.7495854063018242, + "grad_norm": 4.018293857574463, + "learning_rate": 1.2876634451392839e-05, + "loss": 5.2337, + "step": 13560 + }, + { + "epoch": 0.749861802100608, + "grad_norm": 4.163745880126953, + "learning_rate": 1.2862421830585561e-05, + "loss": 4.9956, + "step": 13565 + }, + { + "epoch": 0.750138197899392, + "grad_norm": 3.6494219303131104, + "learning_rate": 1.2848209209778283e-05, + "loss": 4.6203, + "step": 13570 + }, + { + "epoch": 0.7504145936981758, + "grad_norm": 3.210977077484131, + "learning_rate": 1.2833996588971006e-05, + "loss": 4.9079, + "step": 13575 + }, + { + "epoch": 0.7506909894969597, + "grad_norm": 3.0875606536865234, + "learning_rate": 1.2819783968163728e-05, + "loss": 5.0435, + "step": 13580 + }, + { + "epoch": 0.7509673852957435, + "grad_norm": 3.371025323867798, + "learning_rate": 1.2805571347356454e-05, + "loss": 5.0249, + "step": 13585 + }, + { + "epoch": 0.7512437810945274, + "grad_norm": 3.431417226791382, + "learning_rate": 1.2791358726549178e-05, + "loss": 4.8918, + "step": 13590 + }, + { + "epoch": 0.7515201768933112, + "grad_norm": 4.608222007751465, + "learning_rate": 1.27771461057419e-05, + "loss": 4.8191, + "step": 13595 + }, + { + "epoch": 0.7517965726920951, + "grad_norm": 4.438573360443115, + "learning_rate": 1.2762933484934622e-05, + "loss": 5.1095, + "step": 13600 + }, + { + "epoch": 0.7520729684908789, + "grad_norm": 3.5704104900360107, + "learning_rate": 1.2748720864127345e-05, + "loss": 4.8193, + "step": 13605 + }, + { + "epoch": 0.7523493642896628, + "grad_norm": 3.665001630783081, + "learning_rate": 1.273450824332007e-05, + "loss": 4.9377, + "step": 13610 + }, + { + "epoch": 0.7526257600884466, + "grad_norm": 4.091150760650635, + "learning_rate": 1.2720295622512792e-05, + "loss": 4.758, + "step": 13615 + }, + { + "epoch": 0.7529021558872305, + "grad_norm": 3.687591791152954, + "learning_rate": 1.2706083001705515e-05, + "loss": 4.9703, + "step": 13620 + }, + { + "epoch": 0.7531785516860143, + "grad_norm": 3.9441850185394287, + "learning_rate": 1.2691870380898239e-05, + "loss": 5.0439, + "step": 13625 + }, + { + "epoch": 0.7534549474847982, + "grad_norm": 3.240487575531006, + "learning_rate": 1.2677657760090961e-05, + "loss": 4.6799, + "step": 13630 + }, + { + "epoch": 0.753731343283582, + "grad_norm": 3.4183409214019775, + "learning_rate": 1.2663445139283683e-05, + "loss": 5.1698, + "step": 13635 + }, + { + "epoch": 0.754007739082366, + "grad_norm": 3.549414873123169, + "learning_rate": 1.2649232518476409e-05, + "loss": 4.9135, + "step": 13640 + }, + { + "epoch": 0.7542841348811498, + "grad_norm": 3.5168375968933105, + "learning_rate": 1.2635019897669131e-05, + "loss": 5.078, + "step": 13645 + }, + { + "epoch": 0.7545605306799337, + "grad_norm": 2.9269330501556396, + "learning_rate": 1.2620807276861854e-05, + "loss": 4.9437, + "step": 13650 + }, + { + "epoch": 0.7548369264787175, + "grad_norm": 3.613555908203125, + "learning_rate": 1.2606594656054576e-05, + "loss": 5.1616, + "step": 13655 + }, + { + "epoch": 0.7551133222775014, + "grad_norm": 3.630427122116089, + "learning_rate": 1.25923820352473e-05, + "loss": 5.0425, + "step": 13660 + }, + { + "epoch": 0.7553897180762852, + "grad_norm": 3.5098865032196045, + "learning_rate": 1.2578169414440025e-05, + "loss": 4.9677, + "step": 13665 + }, + { + "epoch": 0.7556661138750691, + "grad_norm": 3.6350200176239014, + "learning_rate": 1.2563956793632748e-05, + "loss": 5.1123, + "step": 13670 + }, + { + "epoch": 0.7559425096738529, + "grad_norm": 3.2419891357421875, + "learning_rate": 1.254974417282547e-05, + "loss": 5.0052, + "step": 13675 + }, + { + "epoch": 0.7562189054726368, + "grad_norm": 3.6156861782073975, + "learning_rate": 1.2535531552018192e-05, + "loss": 4.6917, + "step": 13680 + }, + { + "epoch": 0.7564953012714207, + "grad_norm": 3.5388574600219727, + "learning_rate": 1.2521318931210915e-05, + "loss": 4.9765, + "step": 13685 + }, + { + "epoch": 0.7567716970702045, + "grad_norm": 3.379070997238159, + "learning_rate": 1.2507106310403639e-05, + "loss": 4.8777, + "step": 13690 + }, + { + "epoch": 0.7570480928689884, + "grad_norm": 3.054107904434204, + "learning_rate": 1.2492893689596363e-05, + "loss": 4.6504, + "step": 13695 + }, + { + "epoch": 0.7573244886677722, + "grad_norm": 4.416358947753906, + "learning_rate": 1.2478681068789087e-05, + "loss": 4.9261, + "step": 13700 + }, + { + "epoch": 0.7576008844665562, + "grad_norm": 3.9947686195373535, + "learning_rate": 1.2464468447981809e-05, + "loss": 5.0292, + "step": 13705 + }, + { + "epoch": 0.75787728026534, + "grad_norm": 3.1346933841705322, + "learning_rate": 1.2450255827174531e-05, + "loss": 4.9894, + "step": 13710 + }, + { + "epoch": 0.7581536760641239, + "grad_norm": 3.530553102493286, + "learning_rate": 1.2436043206367255e-05, + "loss": 5.091, + "step": 13715 + }, + { + "epoch": 0.7584300718629077, + "grad_norm": 3.7621891498565674, + "learning_rate": 1.2421830585559977e-05, + "loss": 4.74, + "step": 13720 + }, + { + "epoch": 0.7587064676616916, + "grad_norm": 3.6707799434661865, + "learning_rate": 1.24076179647527e-05, + "loss": 5.1945, + "step": 13725 + }, + { + "epoch": 0.7589828634604754, + "grad_norm": 3.413144111633301, + "learning_rate": 1.2393405343945425e-05, + "loss": 4.9481, + "step": 13730 + }, + { + "epoch": 0.7592592592592593, + "grad_norm": 3.7922909259796143, + "learning_rate": 1.2379192723138148e-05, + "loss": 5.0266, + "step": 13735 + }, + { + "epoch": 0.7595356550580431, + "grad_norm": 3.63356876373291, + "learning_rate": 1.236498010233087e-05, + "loss": 5.0376, + "step": 13740 + }, + { + "epoch": 0.759812050856827, + "grad_norm": 3.9122843742370605, + "learning_rate": 1.2350767481523594e-05, + "loss": 4.7132, + "step": 13745 + }, + { + "epoch": 0.7600884466556108, + "grad_norm": 4.200931072235107, + "learning_rate": 1.2336554860716316e-05, + "loss": 4.8281, + "step": 13750 + }, + { + "epoch": 0.7603648424543947, + "grad_norm": 3.4712514877319336, + "learning_rate": 1.232234223990904e-05, + "loss": 4.9505, + "step": 13755 + }, + { + "epoch": 0.7606412382531785, + "grad_norm": 3.5167579650878906, + "learning_rate": 1.2308129619101762e-05, + "loss": 4.8998, + "step": 13760 + }, + { + "epoch": 0.7609176340519624, + "grad_norm": 2.8712246417999268, + "learning_rate": 1.2293916998294486e-05, + "loss": 4.9693, + "step": 13765 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 3.5918309688568115, + "learning_rate": 1.227970437748721e-05, + "loss": 5.3341, + "step": 13770 + }, + { + "epoch": 0.7614704256495302, + "grad_norm": 3.4594647884368896, + "learning_rate": 1.2265491756679933e-05, + "loss": 4.7239, + "step": 13775 + }, + { + "epoch": 0.761746821448314, + "grad_norm": 3.583055257797241, + "learning_rate": 1.2251279135872655e-05, + "loss": 4.8022, + "step": 13780 + }, + { + "epoch": 0.7620232172470979, + "grad_norm": 3.9287872314453125, + "learning_rate": 1.2237066515065379e-05, + "loss": 4.6748, + "step": 13785 + }, + { + "epoch": 0.7622996130458817, + "grad_norm": 4.147637367248535, + "learning_rate": 1.2222853894258101e-05, + "loss": 4.9476, + "step": 13790 + }, + { + "epoch": 0.7625760088446656, + "grad_norm": 3.5439789295196533, + "learning_rate": 1.2208641273450824e-05, + "loss": 4.9426, + "step": 13795 + }, + { + "epoch": 0.7628524046434494, + "grad_norm": 3.4278995990753174, + "learning_rate": 1.2194428652643548e-05, + "loss": 5.0638, + "step": 13800 + }, + { + "epoch": 0.7631288004422333, + "grad_norm": 3.199315071105957, + "learning_rate": 1.2180216031836271e-05, + "loss": 4.9484, + "step": 13805 + }, + { + "epoch": 0.7634051962410171, + "grad_norm": 3.8618929386138916, + "learning_rate": 1.2166003411028995e-05, + "loss": 4.7998, + "step": 13810 + }, + { + "epoch": 0.763681592039801, + "grad_norm": 3.2143681049346924, + "learning_rate": 1.2151790790221718e-05, + "loss": 4.9469, + "step": 13815 + }, + { + "epoch": 0.7639579878385848, + "grad_norm": 3.6355247497558594, + "learning_rate": 1.213757816941444e-05, + "loss": 4.7807, + "step": 13820 + }, + { + "epoch": 0.7642343836373687, + "grad_norm": 4.1098246574401855, + "learning_rate": 1.2123365548607164e-05, + "loss": 4.9628, + "step": 13825 + }, + { + "epoch": 0.7645107794361525, + "grad_norm": 3.3956034183502197, + "learning_rate": 1.2109152927799886e-05, + "loss": 5.1022, + "step": 13830 + }, + { + "epoch": 0.7647871752349364, + "grad_norm": 3.51596736907959, + "learning_rate": 1.209494030699261e-05, + "loss": 4.7867, + "step": 13835 + }, + { + "epoch": 0.7650635710337202, + "grad_norm": 3.4072954654693604, + "learning_rate": 1.2080727686185334e-05, + "loss": 5.0985, + "step": 13840 + }, + { + "epoch": 0.7653399668325042, + "grad_norm": 2.994511842727661, + "learning_rate": 1.2066515065378057e-05, + "loss": 4.9816, + "step": 13845 + }, + { + "epoch": 0.765616362631288, + "grad_norm": 3.5091991424560547, + "learning_rate": 1.2052302444570779e-05, + "loss": 5.0666, + "step": 13850 + }, + { + "epoch": 0.7658927584300719, + "grad_norm": 3.4935755729675293, + "learning_rate": 1.2038089823763503e-05, + "loss": 4.8671, + "step": 13855 + }, + { + "epoch": 0.7661691542288557, + "grad_norm": 3.766550302505493, + "learning_rate": 1.2023877202956225e-05, + "loss": 4.9597, + "step": 13860 + }, + { + "epoch": 0.7664455500276396, + "grad_norm": 3.4647650718688965, + "learning_rate": 1.2009664582148949e-05, + "loss": 5.0242, + "step": 13865 + }, + { + "epoch": 0.7667219458264234, + "grad_norm": 3.2583017349243164, + "learning_rate": 1.1995451961341671e-05, + "loss": 5.1654, + "step": 13870 + }, + { + "epoch": 0.7669983416252073, + "grad_norm": 4.087442874908447, + "learning_rate": 1.1981239340534395e-05, + "loss": 5.029, + "step": 13875 + }, + { + "epoch": 0.7672747374239911, + "grad_norm": 3.532439947128296, + "learning_rate": 1.196702671972712e-05, + "loss": 4.7802, + "step": 13880 + }, + { + "epoch": 0.767551133222775, + "grad_norm": 2.9457197189331055, + "learning_rate": 1.1952814098919842e-05, + "loss": 4.8096, + "step": 13885 + }, + { + "epoch": 0.7678275290215588, + "grad_norm": 3.6090781688690186, + "learning_rate": 1.1938601478112564e-05, + "loss": 4.9288, + "step": 13890 + }, + { + "epoch": 0.7681039248203427, + "grad_norm": 4.044902801513672, + "learning_rate": 1.1924388857305288e-05, + "loss": 4.8477, + "step": 13895 + }, + { + "epoch": 0.7683803206191266, + "grad_norm": 4.630081653594971, + "learning_rate": 1.191017623649801e-05, + "loss": 4.9998, + "step": 13900 + }, + { + "epoch": 0.7686567164179104, + "grad_norm": 4.869181156158447, + "learning_rate": 1.1895963615690734e-05, + "loss": 4.8786, + "step": 13905 + }, + { + "epoch": 0.7689331122166944, + "grad_norm": 3.5769691467285156, + "learning_rate": 1.1881750994883458e-05, + "loss": 4.9105, + "step": 13910 + }, + { + "epoch": 0.7692095080154782, + "grad_norm": 2.8430140018463135, + "learning_rate": 1.186753837407618e-05, + "loss": 4.9828, + "step": 13915 + }, + { + "epoch": 0.7694859038142621, + "grad_norm": 3.4973304271698, + "learning_rate": 1.1853325753268904e-05, + "loss": 4.7579, + "step": 13920 + }, + { + "epoch": 0.7697622996130459, + "grad_norm": 3.7342896461486816, + "learning_rate": 1.1839113132461627e-05, + "loss": 5.0691, + "step": 13925 + }, + { + "epoch": 0.7700386954118298, + "grad_norm": 3.4098775386810303, + "learning_rate": 1.1824900511654349e-05, + "loss": 4.8514, + "step": 13930 + }, + { + "epoch": 0.7703150912106136, + "grad_norm": 3.3334388732910156, + "learning_rate": 1.1810687890847073e-05, + "loss": 5.1646, + "step": 13935 + }, + { + "epoch": 0.7705914870093975, + "grad_norm": 4.081144332885742, + "learning_rate": 1.1796475270039795e-05, + "loss": 5.1261, + "step": 13940 + }, + { + "epoch": 0.7708678828081813, + "grad_norm": 3.5878841876983643, + "learning_rate": 1.178226264923252e-05, + "loss": 4.9911, + "step": 13945 + }, + { + "epoch": 0.7711442786069652, + "grad_norm": 3.167708158493042, + "learning_rate": 1.1768050028425243e-05, + "loss": 4.8064, + "step": 13950 + }, + { + "epoch": 0.771420674405749, + "grad_norm": 4.6932806968688965, + "learning_rate": 1.1753837407617965e-05, + "loss": 5.0513, + "step": 13955 + }, + { + "epoch": 0.7716970702045329, + "grad_norm": 3.659235954284668, + "learning_rate": 1.1739624786810688e-05, + "loss": 4.88, + "step": 13960 + }, + { + "epoch": 0.7719734660033167, + "grad_norm": 3.2741990089416504, + "learning_rate": 1.1725412166003412e-05, + "loss": 4.7233, + "step": 13965 + }, + { + "epoch": 0.7722498618021006, + "grad_norm": 3.8330390453338623, + "learning_rate": 1.1711199545196134e-05, + "loss": 4.9704, + "step": 13970 + }, + { + "epoch": 0.7725262576008844, + "grad_norm": 3.3870649337768555, + "learning_rate": 1.1696986924388858e-05, + "loss": 4.7279, + "step": 13975 + }, + { + "epoch": 0.7728026533996684, + "grad_norm": 3.542768716812134, + "learning_rate": 1.1682774303581582e-05, + "loss": 5.0573, + "step": 13980 + }, + { + "epoch": 0.7730790491984522, + "grad_norm": 4.090266227722168, + "learning_rate": 1.1668561682774304e-05, + "loss": 5.2034, + "step": 13985 + }, + { + "epoch": 0.7733554449972361, + "grad_norm": 3.7428195476531982, + "learning_rate": 1.1654349061967028e-05, + "loss": 5.0463, + "step": 13990 + }, + { + "epoch": 0.7736318407960199, + "grad_norm": 4.728503227233887, + "learning_rate": 1.164013644115975e-05, + "loss": 4.7835, + "step": 13995 + }, + { + "epoch": 0.7739082365948038, + "grad_norm": 3.689528226852417, + "learning_rate": 1.1625923820352473e-05, + "loss": 5.2645, + "step": 14000 + }, + { + "epoch": 0.7741846323935876, + "grad_norm": 4.224091529846191, + "learning_rate": 1.1611711199545197e-05, + "loss": 4.7057, + "step": 14005 + }, + { + "epoch": 0.7744610281923715, + "grad_norm": 3.54939866065979, + "learning_rate": 1.1597498578737919e-05, + "loss": 5.2695, + "step": 14010 + }, + { + "epoch": 0.7747374239911553, + "grad_norm": 3.281078338623047, + "learning_rate": 1.1583285957930643e-05, + "loss": 5.0044, + "step": 14015 + }, + { + "epoch": 0.7750138197899392, + "grad_norm": 5.049131870269775, + "learning_rate": 1.1569073337123367e-05, + "loss": 5.2212, + "step": 14020 + }, + { + "epoch": 0.775290215588723, + "grad_norm": 3.362013339996338, + "learning_rate": 1.155486071631609e-05, + "loss": 4.8976, + "step": 14025 + }, + { + "epoch": 0.7755666113875069, + "grad_norm": 5.27224063873291, + "learning_rate": 1.1540648095508812e-05, + "loss": 4.898, + "step": 14030 + }, + { + "epoch": 0.7758430071862907, + "grad_norm": 3.802133560180664, + "learning_rate": 1.1526435474701536e-05, + "loss": 4.8342, + "step": 14035 + }, + { + "epoch": 0.7761194029850746, + "grad_norm": 3.688088893890381, + "learning_rate": 1.1512222853894258e-05, + "loss": 4.8441, + "step": 14040 + }, + { + "epoch": 0.7763957987838584, + "grad_norm": 3.9619669914245605, + "learning_rate": 1.1498010233086982e-05, + "loss": 5.2455, + "step": 14045 + }, + { + "epoch": 0.7766721945826424, + "grad_norm": 3.759188413619995, + "learning_rate": 1.1483797612279706e-05, + "loss": 5.1307, + "step": 14050 + }, + { + "epoch": 0.7769485903814262, + "grad_norm": 3.8243587017059326, + "learning_rate": 1.1469584991472428e-05, + "loss": 4.9645, + "step": 14055 + }, + { + "epoch": 0.7772249861802101, + "grad_norm": 3.4312751293182373, + "learning_rate": 1.1455372370665152e-05, + "loss": 4.9075, + "step": 14060 + }, + { + "epoch": 0.7775013819789939, + "grad_norm": 3.4000275135040283, + "learning_rate": 1.1441159749857874e-05, + "loss": 4.8989, + "step": 14065 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 3.78006911277771, + "learning_rate": 1.1426947129050597e-05, + "loss": 5.2333, + "step": 14070 + }, + { + "epoch": 0.7780541735765616, + "grad_norm": 3.4480984210968018, + "learning_rate": 1.141273450824332e-05, + "loss": 4.9613, + "step": 14075 + }, + { + "epoch": 0.7783305693753455, + "grad_norm": 3.377744674682617, + "learning_rate": 1.1398521887436043e-05, + "loss": 5.0646, + "step": 14080 + }, + { + "epoch": 0.7786069651741293, + "grad_norm": 3.1980888843536377, + "learning_rate": 1.1384309266628767e-05, + "loss": 4.9803, + "step": 14085 + }, + { + "epoch": 0.7788833609729132, + "grad_norm": 3.6203091144561768, + "learning_rate": 1.1370096645821491e-05, + "loss": 4.8729, + "step": 14090 + }, + { + "epoch": 0.779159756771697, + "grad_norm": 3.161928176879883, + "learning_rate": 1.1355884025014213e-05, + "loss": 5.1262, + "step": 14095 + }, + { + "epoch": 0.7794361525704809, + "grad_norm": 3.8327505588531494, + "learning_rate": 1.1341671404206937e-05, + "loss": 5.0207, + "step": 14100 + }, + { + "epoch": 0.7797125483692647, + "grad_norm": 3.690469980239868, + "learning_rate": 1.132745878339966e-05, + "loss": 5.0829, + "step": 14105 + }, + { + "epoch": 0.7799889441680486, + "grad_norm": 3.06017804145813, + "learning_rate": 1.1313246162592382e-05, + "loss": 5.2518, + "step": 14110 + }, + { + "epoch": 0.7802653399668324, + "grad_norm": 4.095527172088623, + "learning_rate": 1.1299033541785106e-05, + "loss": 5.1885, + "step": 14115 + }, + { + "epoch": 0.7805417357656164, + "grad_norm": 3.2654521465301514, + "learning_rate": 1.128482092097783e-05, + "loss": 4.9736, + "step": 14120 + }, + { + "epoch": 0.7808181315644003, + "grad_norm": 3.9866836071014404, + "learning_rate": 1.1270608300170552e-05, + "loss": 5.0144, + "step": 14125 + }, + { + "epoch": 0.7810945273631841, + "grad_norm": 4.128415584564209, + "learning_rate": 1.1256395679363276e-05, + "loss": 5.0449, + "step": 14130 + }, + { + "epoch": 0.781370923161968, + "grad_norm": 3.3916735649108887, + "learning_rate": 1.1242183058555998e-05, + "loss": 5.0108, + "step": 14135 + }, + { + "epoch": 0.7816473189607518, + "grad_norm": 3.543978452682495, + "learning_rate": 1.122797043774872e-05, + "loss": 4.8539, + "step": 14140 + }, + { + "epoch": 0.7819237147595357, + "grad_norm": 3.8419699668884277, + "learning_rate": 1.1213757816941444e-05, + "loss": 4.722, + "step": 14145 + }, + { + "epoch": 0.7822001105583195, + "grad_norm": 3.7718849182128906, + "learning_rate": 1.1199545196134167e-05, + "loss": 4.9846, + "step": 14150 + }, + { + "epoch": 0.7824765063571034, + "grad_norm": 3.22829532623291, + "learning_rate": 1.118533257532689e-05, + "loss": 4.6035, + "step": 14155 + }, + { + "epoch": 0.7827529021558872, + "grad_norm": 3.4191575050354004, + "learning_rate": 1.1171119954519615e-05, + "loss": 5.0507, + "step": 14160 + }, + { + "epoch": 0.7830292979546711, + "grad_norm": 4.302538871765137, + "learning_rate": 1.1156907333712337e-05, + "loss": 4.8843, + "step": 14165 + }, + { + "epoch": 0.7833056937534549, + "grad_norm": 4.03009557723999, + "learning_rate": 1.1142694712905061e-05, + "loss": 4.8532, + "step": 14170 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 3.964759349822998, + "learning_rate": 1.1128482092097783e-05, + "loss": 5.3011, + "step": 14175 + }, + { + "epoch": 0.7838584853510226, + "grad_norm": 3.5552785396575928, + "learning_rate": 1.1114269471290506e-05, + "loss": 5.0725, + "step": 14180 + }, + { + "epoch": 0.7841348811498066, + "grad_norm": 3.765488862991333, + "learning_rate": 1.110005685048323e-05, + "loss": 4.8646, + "step": 14185 + }, + { + "epoch": 0.7844112769485904, + "grad_norm": 3.883822441101074, + "learning_rate": 1.1085844229675952e-05, + "loss": 4.9223, + "step": 14190 + }, + { + "epoch": 0.7846876727473743, + "grad_norm": 3.70638370513916, + "learning_rate": 1.1071631608868676e-05, + "loss": 5.1099, + "step": 14195 + }, + { + "epoch": 0.7849640685461581, + "grad_norm": 4.200986385345459, + "learning_rate": 1.10574189880614e-05, + "loss": 4.722, + "step": 14200 + }, + { + "epoch": 0.785240464344942, + "grad_norm": 3.2610623836517334, + "learning_rate": 1.1043206367254122e-05, + "loss": 4.8481, + "step": 14205 + }, + { + "epoch": 0.7855168601437258, + "grad_norm": 3.607994556427002, + "learning_rate": 1.1028993746446846e-05, + "loss": 4.6585, + "step": 14210 + }, + { + "epoch": 0.7857932559425097, + "grad_norm": 3.5993244647979736, + "learning_rate": 1.1014781125639568e-05, + "loss": 4.8042, + "step": 14215 + }, + { + "epoch": 0.7860696517412935, + "grad_norm": 3.5972378253936768, + "learning_rate": 1.100056850483229e-05, + "loss": 4.8298, + "step": 14220 + }, + { + "epoch": 0.7863460475400774, + "grad_norm": 3.5555312633514404, + "learning_rate": 1.0986355884025015e-05, + "loss": 5.0829, + "step": 14225 + }, + { + "epoch": 0.7866224433388612, + "grad_norm": 3.3311171531677246, + "learning_rate": 1.0972143263217739e-05, + "loss": 5.0186, + "step": 14230 + }, + { + "epoch": 0.7868988391376451, + "grad_norm": 3.552480936050415, + "learning_rate": 1.0957930642410461e-05, + "loss": 4.6854, + "step": 14235 + }, + { + "epoch": 0.7871752349364289, + "grad_norm": 3.2884206771850586, + "learning_rate": 1.0943718021603185e-05, + "loss": 5.0206, + "step": 14240 + }, + { + "epoch": 0.7874516307352128, + "grad_norm": 3.535804033279419, + "learning_rate": 1.0929505400795907e-05, + "loss": 4.8758, + "step": 14245 + }, + { + "epoch": 0.7877280265339967, + "grad_norm": 4.292071342468262, + "learning_rate": 1.091529277998863e-05, + "loss": 5.2122, + "step": 14250 + }, + { + "epoch": 0.7880044223327806, + "grad_norm": 3.89021372795105, + "learning_rate": 1.0901080159181353e-05, + "loss": 4.775, + "step": 14255 + }, + { + "epoch": 0.7882808181315644, + "grad_norm": 3.114368438720703, + "learning_rate": 1.0886867538374076e-05, + "loss": 5.1345, + "step": 14260 + }, + { + "epoch": 0.7885572139303483, + "grad_norm": 3.6420657634735107, + "learning_rate": 1.08726549175668e-05, + "loss": 4.7937, + "step": 14265 + }, + { + "epoch": 0.7888336097291321, + "grad_norm": 3.4523563385009766, + "learning_rate": 1.0858442296759524e-05, + "loss": 4.8285, + "step": 14270 + }, + { + "epoch": 0.789110005527916, + "grad_norm": 3.896808624267578, + "learning_rate": 1.0844229675952246e-05, + "loss": 5.1532, + "step": 14275 + }, + { + "epoch": 0.7893864013266998, + "grad_norm": 3.9850831031799316, + "learning_rate": 1.083001705514497e-05, + "loss": 5.117, + "step": 14280 + }, + { + "epoch": 0.7896627971254837, + "grad_norm": 3.7816109657287598, + "learning_rate": 1.0815804434337692e-05, + "loss": 4.8639, + "step": 14285 + }, + { + "epoch": 0.7899391929242675, + "grad_norm": 3.2434422969818115, + "learning_rate": 1.0801591813530414e-05, + "loss": 4.716, + "step": 14290 + }, + { + "epoch": 0.7902155887230514, + "grad_norm": 4.700549125671387, + "learning_rate": 1.0787379192723138e-05, + "loss": 5.1435, + "step": 14295 + }, + { + "epoch": 0.7904919845218352, + "grad_norm": 4.932585716247559, + "learning_rate": 1.0773166571915862e-05, + "loss": 5.0201, + "step": 14300 + }, + { + "epoch": 0.7907683803206191, + "grad_norm": 4.089287281036377, + "learning_rate": 1.0758953951108585e-05, + "loss": 4.8033, + "step": 14305 + }, + { + "epoch": 0.7910447761194029, + "grad_norm": 4.705197811126709, + "learning_rate": 1.0744741330301309e-05, + "loss": 4.6571, + "step": 14310 + }, + { + "epoch": 0.7913211719181868, + "grad_norm": 4.048484802246094, + "learning_rate": 1.0730528709494031e-05, + "loss": 4.8813, + "step": 14315 + }, + { + "epoch": 0.7915975677169707, + "grad_norm": 4.197638988494873, + "learning_rate": 1.0716316088686755e-05, + "loss": 4.991, + "step": 14320 + }, + { + "epoch": 0.7918739635157546, + "grad_norm": 3.081589698791504, + "learning_rate": 1.0702103467879477e-05, + "loss": 4.7785, + "step": 14325 + }, + { + "epoch": 0.7921503593145384, + "grad_norm": 3.1407415866851807, + "learning_rate": 1.06878908470722e-05, + "loss": 4.7891, + "step": 14330 + }, + { + "epoch": 0.7924267551133223, + "grad_norm": 3.6896438598632812, + "learning_rate": 1.0673678226264924e-05, + "loss": 4.7142, + "step": 14335 + }, + { + "epoch": 0.7927031509121062, + "grad_norm": 3.5066559314727783, + "learning_rate": 1.0659465605457647e-05, + "loss": 5.1038, + "step": 14340 + }, + { + "epoch": 0.79297954671089, + "grad_norm": 4.803554058074951, + "learning_rate": 1.064525298465037e-05, + "loss": 5.1086, + "step": 14345 + }, + { + "epoch": 0.7932559425096739, + "grad_norm": 3.1235287189483643, + "learning_rate": 1.0631040363843094e-05, + "loss": 4.952, + "step": 14350 + }, + { + "epoch": 0.7935323383084577, + "grad_norm": 4.96946382522583, + "learning_rate": 1.0616827743035816e-05, + "loss": 5.2588, + "step": 14355 + }, + { + "epoch": 0.7938087341072416, + "grad_norm": 3.8352441787719727, + "learning_rate": 1.0602615122228538e-05, + "loss": 4.871, + "step": 14360 + }, + { + "epoch": 0.7940851299060254, + "grad_norm": 3.559532642364502, + "learning_rate": 1.0588402501421262e-05, + "loss": 4.5453, + "step": 14365 + }, + { + "epoch": 0.7943615257048093, + "grad_norm": 4.492837905883789, + "learning_rate": 1.0574189880613986e-05, + "loss": 5.0764, + "step": 14370 + }, + { + "epoch": 0.7946379215035931, + "grad_norm": 3.7767648696899414, + "learning_rate": 1.055997725980671e-05, + "loss": 5.1033, + "step": 14375 + }, + { + "epoch": 0.794914317302377, + "grad_norm": 3.4074301719665527, + "learning_rate": 1.0545764638999433e-05, + "loss": 4.6381, + "step": 14380 + }, + { + "epoch": 0.7951907131011609, + "grad_norm": 4.1058783531188965, + "learning_rate": 1.0531552018192155e-05, + "loss": 4.7906, + "step": 14385 + }, + { + "epoch": 0.7954671088999448, + "grad_norm": 3.570598840713501, + "learning_rate": 1.0517339397384879e-05, + "loss": 4.7018, + "step": 14390 + }, + { + "epoch": 0.7957435046987286, + "grad_norm": 4.346248626708984, + "learning_rate": 1.0503126776577601e-05, + "loss": 5.2179, + "step": 14395 + }, + { + "epoch": 0.7960199004975125, + "grad_norm": 4.531290054321289, + "learning_rate": 1.0488914155770323e-05, + "loss": 5.124, + "step": 14400 + }, + { + "epoch": 0.7962962962962963, + "grad_norm": 4.11277961730957, + "learning_rate": 1.0474701534963047e-05, + "loss": 4.9454, + "step": 14405 + }, + { + "epoch": 0.7965726920950802, + "grad_norm": 4.518270015716553, + "learning_rate": 1.0460488914155771e-05, + "loss": 5.0708, + "step": 14410 + }, + { + "epoch": 0.796849087893864, + "grad_norm": 3.6835479736328125, + "learning_rate": 1.0446276293348494e-05, + "loss": 4.9844, + "step": 14415 + }, + { + "epoch": 0.7971254836926479, + "grad_norm": 4.1077880859375, + "learning_rate": 1.0432063672541218e-05, + "loss": 4.8259, + "step": 14420 + }, + { + "epoch": 0.7974018794914317, + "grad_norm": 3.0021631717681885, + "learning_rate": 1.041785105173394e-05, + "loss": 5.0534, + "step": 14425 + }, + { + "epoch": 0.7976782752902156, + "grad_norm": 3.630094289779663, + "learning_rate": 1.0403638430926662e-05, + "loss": 4.9099, + "step": 14430 + }, + { + "epoch": 0.7979546710889994, + "grad_norm": 3.7591092586517334, + "learning_rate": 1.0389425810119386e-05, + "loss": 5.1555, + "step": 14435 + }, + { + "epoch": 0.7982310668877833, + "grad_norm": 4.0143656730651855, + "learning_rate": 1.037521318931211e-05, + "loss": 4.811, + "step": 14440 + }, + { + "epoch": 0.7985074626865671, + "grad_norm": 4.8033976554870605, + "learning_rate": 1.0361000568504834e-05, + "loss": 5.0588, + "step": 14445 + }, + { + "epoch": 0.798783858485351, + "grad_norm": 4.348042964935303, + "learning_rate": 1.0346787947697556e-05, + "loss": 4.6481, + "step": 14450 + }, + { + "epoch": 0.7990602542841349, + "grad_norm": 3.549741506576538, + "learning_rate": 1.0332575326890279e-05, + "loss": 4.9424, + "step": 14455 + }, + { + "epoch": 0.7993366500829188, + "grad_norm": 3.3581111431121826, + "learning_rate": 1.0318362706083003e-05, + "loss": 5.1743, + "step": 14460 + }, + { + "epoch": 0.7996130458817026, + "grad_norm": 4.2459540367126465, + "learning_rate": 1.0304150085275725e-05, + "loss": 5.1007, + "step": 14465 + }, + { + "epoch": 0.7998894416804865, + "grad_norm": 5.176113128662109, + "learning_rate": 1.0289937464468447e-05, + "loss": 5.1488, + "step": 14470 + }, + { + "epoch": 0.8001658374792703, + "grad_norm": 3.7594547271728516, + "learning_rate": 1.0275724843661171e-05, + "loss": 5.0909, + "step": 14475 + }, + { + "epoch": 0.8004422332780542, + "grad_norm": 4.071329593658447, + "learning_rate": 1.0261512222853895e-05, + "loss": 5.1695, + "step": 14480 + }, + { + "epoch": 0.800718629076838, + "grad_norm": 2.798461437225342, + "learning_rate": 1.0247299602046617e-05, + "loss": 4.7519, + "step": 14485 + }, + { + "epoch": 0.8009950248756219, + "grad_norm": 3.2360336780548096, + "learning_rate": 1.0233086981239341e-05, + "loss": 5.04, + "step": 14490 + }, + { + "epoch": 0.8012714206744057, + "grad_norm": 3.919640064239502, + "learning_rate": 1.0218874360432064e-05, + "loss": 4.8624, + "step": 14495 + }, + { + "epoch": 0.8015478164731896, + "grad_norm": 3.5546929836273193, + "learning_rate": 1.0204661739624788e-05, + "loss": 4.8529, + "step": 14500 + }, + { + "epoch": 0.8018242122719734, + "grad_norm": 4.039145469665527, + "learning_rate": 1.019044911881751e-05, + "loss": 5.2288, + "step": 14505 + }, + { + "epoch": 0.8021006080707573, + "grad_norm": 2.8271827697753906, + "learning_rate": 1.0176236498010234e-05, + "loss": 4.7984, + "step": 14510 + }, + { + "epoch": 0.8023770038695411, + "grad_norm": 3.8265321254730225, + "learning_rate": 1.0162023877202958e-05, + "loss": 4.8457, + "step": 14515 + }, + { + "epoch": 0.802653399668325, + "grad_norm": 3.9370837211608887, + "learning_rate": 1.014781125639568e-05, + "loss": 5.279, + "step": 14520 + }, + { + "epoch": 0.8029297954671089, + "grad_norm": 3.1449708938598633, + "learning_rate": 1.0133598635588403e-05, + "loss": 4.7764, + "step": 14525 + }, + { + "epoch": 0.8032061912658928, + "grad_norm": 3.421999454498291, + "learning_rate": 1.0119386014781127e-05, + "loss": 4.887, + "step": 14530 + }, + { + "epoch": 0.8034825870646766, + "grad_norm": 3.242093324661255, + "learning_rate": 1.0105173393973849e-05, + "loss": 4.9132, + "step": 14535 + }, + { + "epoch": 0.8037589828634605, + "grad_norm": 3.8317198753356934, + "learning_rate": 1.0090960773166571e-05, + "loss": 4.9708, + "step": 14540 + }, + { + "epoch": 0.8040353786622443, + "grad_norm": 2.940936326980591, + "learning_rate": 1.0076748152359295e-05, + "loss": 5.1001, + "step": 14545 + }, + { + "epoch": 0.8043117744610282, + "grad_norm": 3.3959786891937256, + "learning_rate": 1.0062535531552019e-05, + "loss": 4.722, + "step": 14550 + }, + { + "epoch": 0.804588170259812, + "grad_norm": 3.077038049697876, + "learning_rate": 1.0048322910744743e-05, + "loss": 4.8792, + "step": 14555 + }, + { + "epoch": 0.8048645660585959, + "grad_norm": 3.9798264503479004, + "learning_rate": 1.0034110289937465e-05, + "loss": 4.864, + "step": 14560 + }, + { + "epoch": 0.8051409618573798, + "grad_norm": 3.65219783782959, + "learning_rate": 1.0019897669130188e-05, + "loss": 5.1996, + "step": 14565 + }, + { + "epoch": 0.8054173576561636, + "grad_norm": 3.0886595249176025, + "learning_rate": 1.0005685048322912e-05, + "loss": 4.9891, + "step": 14570 + }, + { + "epoch": 0.8056937534549475, + "grad_norm": 3.2638444900512695, + "learning_rate": 9.991472427515634e-06, + "loss": 4.4643, + "step": 14575 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 3.3993797302246094, + "learning_rate": 9.977259806708356e-06, + "loss": 4.5518, + "step": 14580 + }, + { + "epoch": 0.8062465450525153, + "grad_norm": 3.588103771209717, + "learning_rate": 9.963047185901082e-06, + "loss": 4.8706, + "step": 14585 + }, + { + "epoch": 0.806522940851299, + "grad_norm": 3.355370044708252, + "learning_rate": 9.948834565093804e-06, + "loss": 5.0936, + "step": 14590 + }, + { + "epoch": 0.806799336650083, + "grad_norm": 3.478992462158203, + "learning_rate": 9.934621944286526e-06, + "loss": 4.8008, + "step": 14595 + }, + { + "epoch": 0.8070757324488668, + "grad_norm": 3.620059013366699, + "learning_rate": 9.92040932347925e-06, + "loss": 4.8624, + "step": 14600 + }, + { + "epoch": 0.8073521282476507, + "grad_norm": 3.095531463623047, + "learning_rate": 9.906196702671973e-06, + "loss": 4.7169, + "step": 14605 + }, + { + "epoch": 0.8076285240464345, + "grad_norm": 3.2527894973754883, + "learning_rate": 9.891984081864697e-06, + "loss": 4.9315, + "step": 14610 + }, + { + "epoch": 0.8079049198452184, + "grad_norm": 3.8432343006134033, + "learning_rate": 9.877771461057419e-06, + "loss": 5.0862, + "step": 14615 + }, + { + "epoch": 0.8081813156440022, + "grad_norm": 4.290609836578369, + "learning_rate": 9.863558840250143e-06, + "loss": 5.1276, + "step": 14620 + }, + { + "epoch": 0.8084577114427861, + "grad_norm": 3.555579662322998, + "learning_rate": 9.849346219442867e-06, + "loss": 5.0188, + "step": 14625 + }, + { + "epoch": 0.8087341072415699, + "grad_norm": 3.1154568195343018, + "learning_rate": 9.835133598635589e-06, + "loss": 5.083, + "step": 14630 + }, + { + "epoch": 0.8090105030403538, + "grad_norm": 3.5717074871063232, + "learning_rate": 9.820920977828311e-06, + "loss": 5.0745, + "step": 14635 + }, + { + "epoch": 0.8092868988391376, + "grad_norm": 3.1403820514678955, + "learning_rate": 9.806708357021035e-06, + "loss": 4.8718, + "step": 14640 + }, + { + "epoch": 0.8095632946379215, + "grad_norm": 3.6060197353363037, + "learning_rate": 9.792495736213758e-06, + "loss": 5.1306, + "step": 14645 + }, + { + "epoch": 0.8098396904367053, + "grad_norm": 3.7077431678771973, + "learning_rate": 9.77828311540648e-06, + "loss": 4.6169, + "step": 14650 + }, + { + "epoch": 0.8101160862354893, + "grad_norm": 4.237201690673828, + "learning_rate": 9.764070494599204e-06, + "loss": 4.6571, + "step": 14655 + }, + { + "epoch": 0.810392482034273, + "grad_norm": 3.263676166534424, + "learning_rate": 9.749857873791928e-06, + "loss": 5.0076, + "step": 14660 + }, + { + "epoch": 0.810668877833057, + "grad_norm": 3.406616687774658, + "learning_rate": 9.735645252984652e-06, + "loss": 4.6793, + "step": 14665 + }, + { + "epoch": 0.8109452736318408, + "grad_norm": 3.664701461791992, + "learning_rate": 9.721432632177374e-06, + "loss": 4.7769, + "step": 14670 + }, + { + "epoch": 0.8112216694306247, + "grad_norm": 3.6615958213806152, + "learning_rate": 9.707220011370096e-06, + "loss": 5.1376, + "step": 14675 + }, + { + "epoch": 0.8114980652294085, + "grad_norm": 3.585599660873413, + "learning_rate": 9.69300739056282e-06, + "loss": 4.8062, + "step": 14680 + }, + { + "epoch": 0.8117744610281924, + "grad_norm": 3.3951401710510254, + "learning_rate": 9.678794769755543e-06, + "loss": 5.0233, + "step": 14685 + }, + { + "epoch": 0.8120508568269762, + "grad_norm": 3.261725664138794, + "learning_rate": 9.664582148948267e-06, + "loss": 4.957, + "step": 14690 + }, + { + "epoch": 0.8123272526257601, + "grad_norm": 3.23929762840271, + "learning_rate": 9.65036952814099e-06, + "loss": 4.885, + "step": 14695 + }, + { + "epoch": 0.8126036484245439, + "grad_norm": 3.207082748413086, + "learning_rate": 9.636156907333713e-06, + "loss": 4.5523, + "step": 14700 + }, + { + "epoch": 0.8128800442233278, + "grad_norm": 3.7475905418395996, + "learning_rate": 9.621944286526435e-06, + "loss": 5.0544, + "step": 14705 + }, + { + "epoch": 0.8131564400221116, + "grad_norm": 3.0875489711761475, + "learning_rate": 9.60773166571916e-06, + "loss": 5.127, + "step": 14710 + }, + { + "epoch": 0.8134328358208955, + "grad_norm": 3.5430004596710205, + "learning_rate": 9.593519044911882e-06, + "loss": 4.9775, + "step": 14715 + }, + { + "epoch": 0.8137092316196793, + "grad_norm": 3.482557535171509, + "learning_rate": 9.579306424104606e-06, + "loss": 4.8944, + "step": 14720 + }, + { + "epoch": 0.8139856274184633, + "grad_norm": 3.779650926589966, + "learning_rate": 9.565093803297328e-06, + "loss": 5.2317, + "step": 14725 + }, + { + "epoch": 0.814262023217247, + "grad_norm": 4.0570244789123535, + "learning_rate": 9.550881182490052e-06, + "loss": 4.9351, + "step": 14730 + }, + { + "epoch": 0.814538419016031, + "grad_norm": 3.294402599334717, + "learning_rate": 9.536668561682776e-06, + "loss": 4.63, + "step": 14735 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 3.926581859588623, + "learning_rate": 9.522455940875498e-06, + "loss": 4.9833, + "step": 14740 + }, + { + "epoch": 0.8150912106135987, + "grad_norm": 3.623554229736328, + "learning_rate": 9.50824332006822e-06, + "loss": 4.9796, + "step": 14745 + }, + { + "epoch": 0.8153676064123825, + "grad_norm": 4.15550422668457, + "learning_rate": 9.494030699260944e-06, + "loss": 4.9854, + "step": 14750 + }, + { + "epoch": 0.8156440022111664, + "grad_norm": 4.647388935089111, + "learning_rate": 9.479818078453667e-06, + "loss": 5.336, + "step": 14755 + }, + { + "epoch": 0.8159203980099502, + "grad_norm": 4.139171600341797, + "learning_rate": 9.46560545764639e-06, + "loss": 4.9827, + "step": 14760 + }, + { + "epoch": 0.8161967938087341, + "grad_norm": 3.5119783878326416, + "learning_rate": 9.451392836839115e-06, + "loss": 4.7924, + "step": 14765 + }, + { + "epoch": 0.8164731896075179, + "grad_norm": 3.4265787601470947, + "learning_rate": 9.437180216031837e-06, + "loss": 5.0749, + "step": 14770 + }, + { + "epoch": 0.8167495854063018, + "grad_norm": 3.3151018619537354, + "learning_rate": 9.42296759522456e-06, + "loss": 4.7777, + "step": 14775 + }, + { + "epoch": 0.8170259812050856, + "grad_norm": 3.0593369007110596, + "learning_rate": 9.408754974417283e-06, + "loss": 4.9151, + "step": 14780 + }, + { + "epoch": 0.8173023770038695, + "grad_norm": 3.627476930618286, + "learning_rate": 9.394542353610005e-06, + "loss": 5.2975, + "step": 14785 + }, + { + "epoch": 0.8175787728026535, + "grad_norm": 3.161733388900757, + "learning_rate": 9.38032973280273e-06, + "loss": 5.0477, + "step": 14790 + }, + { + "epoch": 0.8178551686014373, + "grad_norm": 3.5496115684509277, + "learning_rate": 9.366117111995452e-06, + "loss": 5.0122, + "step": 14795 + }, + { + "epoch": 0.8181315644002212, + "grad_norm": 3.702653169631958, + "learning_rate": 9.351904491188176e-06, + "loss": 4.7585, + "step": 14800 + }, + { + "epoch": 0.818407960199005, + "grad_norm": 3.9237051010131836, + "learning_rate": 9.3376918703809e-06, + "loss": 4.6184, + "step": 14805 + }, + { + "epoch": 0.8186843559977889, + "grad_norm": 3.077275276184082, + "learning_rate": 9.323479249573622e-06, + "loss": 4.6302, + "step": 14810 + }, + { + "epoch": 0.8189607517965727, + "grad_norm": 3.5210278034210205, + "learning_rate": 9.309266628766344e-06, + "loss": 5.0052, + "step": 14815 + }, + { + "epoch": 0.8192371475953566, + "grad_norm": 3.1332850456237793, + "learning_rate": 9.295054007959068e-06, + "loss": 4.9236, + "step": 14820 + }, + { + "epoch": 0.8195135433941404, + "grad_norm": 3.8537850379943848, + "learning_rate": 9.28084138715179e-06, + "loss": 5.0165, + "step": 14825 + }, + { + "epoch": 0.8197899391929243, + "grad_norm": 3.768019199371338, + "learning_rate": 9.266628766344514e-06, + "loss": 4.7222, + "step": 14830 + }, + { + "epoch": 0.8200663349917081, + "grad_norm": 4.06325101852417, + "learning_rate": 9.252416145537238e-06, + "loss": 4.942, + "step": 14835 + }, + { + "epoch": 0.820342730790492, + "grad_norm": 3.555703639984131, + "learning_rate": 9.23820352472996e-06, + "loss": 5.0981, + "step": 14840 + }, + { + "epoch": 0.8206191265892758, + "grad_norm": 3.216472625732422, + "learning_rate": 9.223990903922685e-06, + "loss": 5.1655, + "step": 14845 + }, + { + "epoch": 0.8208955223880597, + "grad_norm": 2.9533238410949707, + "learning_rate": 9.209778283115407e-06, + "loss": 4.868, + "step": 14850 + }, + { + "epoch": 0.8211719181868435, + "grad_norm": 3.6127288341522217, + "learning_rate": 9.19556566230813e-06, + "loss": 5.1768, + "step": 14855 + }, + { + "epoch": 0.8214483139856275, + "grad_norm": 3.685889720916748, + "learning_rate": 9.181353041500853e-06, + "loss": 4.8842, + "step": 14860 + }, + { + "epoch": 0.8217247097844113, + "grad_norm": 3.554274559020996, + "learning_rate": 9.167140420693576e-06, + "loss": 4.7565, + "step": 14865 + }, + { + "epoch": 0.8220011055831952, + "grad_norm": 4.190557956695557, + "learning_rate": 9.1529277998863e-06, + "loss": 4.8703, + "step": 14870 + }, + { + "epoch": 0.822277501381979, + "grad_norm": 3.2858259677886963, + "learning_rate": 9.138715179079023e-06, + "loss": 4.8811, + "step": 14875 + }, + { + "epoch": 0.8225538971807629, + "grad_norm": 4.575525760650635, + "learning_rate": 9.124502558271746e-06, + "loss": 5.1692, + "step": 14880 + }, + { + "epoch": 0.8228302929795467, + "grad_norm": 3.1636838912963867, + "learning_rate": 9.11028993746447e-06, + "loss": 4.8458, + "step": 14885 + }, + { + "epoch": 0.8231066887783306, + "grad_norm": 3.5616745948791504, + "learning_rate": 9.096077316657192e-06, + "loss": 4.9314, + "step": 14890 + }, + { + "epoch": 0.8233830845771144, + "grad_norm": 4.337357044219971, + "learning_rate": 9.081864695849914e-06, + "loss": 5.0281, + "step": 14895 + }, + { + "epoch": 0.8236594803758983, + "grad_norm": 3.906620502471924, + "learning_rate": 9.067652075042638e-06, + "loss": 4.895, + "step": 14900 + }, + { + "epoch": 0.8239358761746821, + "grad_norm": 3.766569137573242, + "learning_rate": 9.053439454235362e-06, + "loss": 4.7679, + "step": 14905 + }, + { + "epoch": 0.824212271973466, + "grad_norm": 3.7413177490234375, + "learning_rate": 9.039226833428085e-06, + "loss": 4.8692, + "step": 14910 + }, + { + "epoch": 0.8244886677722498, + "grad_norm": 3.4053595066070557, + "learning_rate": 9.025014212620809e-06, + "loss": 4.625, + "step": 14915 + }, + { + "epoch": 0.8247650635710337, + "grad_norm": 4.579228401184082, + "learning_rate": 9.01080159181353e-06, + "loss": 4.6284, + "step": 14920 + }, + { + "epoch": 0.8250414593698175, + "grad_norm": 4.081772327423096, + "learning_rate": 8.996588971006253e-06, + "loss": 4.9273, + "step": 14925 + }, + { + "epoch": 0.8253178551686015, + "grad_norm": 3.704084634780884, + "learning_rate": 8.982376350198977e-06, + "loss": 4.8167, + "step": 14930 + }, + { + "epoch": 0.8255942509673853, + "grad_norm": 4.213784217834473, + "learning_rate": 8.9681637293917e-06, + "loss": 4.8445, + "step": 14935 + }, + { + "epoch": 0.8258706467661692, + "grad_norm": 3.3844199180603027, + "learning_rate": 8.953951108584423e-06, + "loss": 4.9753, + "step": 14940 + }, + { + "epoch": 0.826147042564953, + "grad_norm": 4.304365158081055, + "learning_rate": 8.939738487777147e-06, + "loss": 4.9407, + "step": 14945 + }, + { + "epoch": 0.8264234383637369, + "grad_norm": 5.198899269104004, + "learning_rate": 8.92552586696987e-06, + "loss": 5.0407, + "step": 14950 + }, + { + "epoch": 0.8266998341625207, + "grad_norm": 3.2233057022094727, + "learning_rate": 8.911313246162594e-06, + "loss": 4.6535, + "step": 14955 + }, + { + "epoch": 0.8269762299613046, + "grad_norm": 3.325000286102295, + "learning_rate": 8.897100625355316e-06, + "loss": 4.9527, + "step": 14960 + }, + { + "epoch": 0.8272526257600884, + "grad_norm": 4.237645626068115, + "learning_rate": 8.882888004548038e-06, + "loss": 4.831, + "step": 14965 + }, + { + "epoch": 0.8275290215588723, + "grad_norm": 3.883349895477295, + "learning_rate": 8.868675383740762e-06, + "loss": 5.2111, + "step": 14970 + }, + { + "epoch": 0.8278054173576561, + "grad_norm": 4.307447910308838, + "learning_rate": 8.854462762933486e-06, + "loss": 4.8234, + "step": 14975 + }, + { + "epoch": 0.82808181315644, + "grad_norm": 3.646482229232788, + "learning_rate": 8.840250142126208e-06, + "loss": 4.6802, + "step": 14980 + }, + { + "epoch": 0.8283582089552238, + "grad_norm": 4.150384902954102, + "learning_rate": 8.826037521318932e-06, + "loss": 4.939, + "step": 14985 + }, + { + "epoch": 0.8286346047540077, + "grad_norm": 3.2273035049438477, + "learning_rate": 8.811824900511655e-06, + "loss": 4.758, + "step": 14990 + }, + { + "epoch": 0.8289110005527915, + "grad_norm": 3.5394163131713867, + "learning_rate": 8.797612279704377e-06, + "loss": 5.039, + "step": 14995 + }, + { + "epoch": 0.8291873963515755, + "grad_norm": 3.0084025859832764, + "learning_rate": 8.783399658897101e-06, + "loss": 5.2781, + "step": 15000 + }, + { + "epoch": 0.8294637921503594, + "grad_norm": 3.8212339878082275, + "learning_rate": 8.769187038089823e-06, + "loss": 4.8291, + "step": 15005 + }, + { + "epoch": 0.8297401879491432, + "grad_norm": 3.359372854232788, + "learning_rate": 8.754974417282547e-06, + "loss": 4.9139, + "step": 15010 + }, + { + "epoch": 0.8300165837479271, + "grad_norm": 3.9898805618286133, + "learning_rate": 8.740761796475271e-06, + "loss": 4.738, + "step": 15015 + }, + { + "epoch": 0.8302929795467109, + "grad_norm": 3.4556021690368652, + "learning_rate": 8.726549175667993e-06, + "loss": 4.6281, + "step": 15020 + }, + { + "epoch": 0.8305693753454948, + "grad_norm": 4.344873905181885, + "learning_rate": 8.712336554860717e-06, + "loss": 5.041, + "step": 15025 + }, + { + "epoch": 0.8308457711442786, + "grad_norm": 4.770878314971924, + "learning_rate": 8.69812393405344e-06, + "loss": 4.8658, + "step": 15030 + }, + { + "epoch": 0.8311221669430625, + "grad_norm": 3.748448610305786, + "learning_rate": 8.683911313246162e-06, + "loss": 4.969, + "step": 15035 + }, + { + "epoch": 0.8313985627418463, + "grad_norm": 4.457357883453369, + "learning_rate": 8.669698692438886e-06, + "loss": 5.2523, + "step": 15040 + }, + { + "epoch": 0.8316749585406302, + "grad_norm": 4.152907371520996, + "learning_rate": 8.65548607163161e-06, + "loss": 4.8646, + "step": 15045 + }, + { + "epoch": 0.831951354339414, + "grad_norm": 3.441721200942993, + "learning_rate": 8.641273450824332e-06, + "loss": 4.8011, + "step": 15050 + }, + { + "epoch": 0.8322277501381979, + "grad_norm": 4.53822660446167, + "learning_rate": 8.627060830017056e-06, + "loss": 5.184, + "step": 15055 + }, + { + "epoch": 0.8325041459369817, + "grad_norm": 3.868192195892334, + "learning_rate": 8.612848209209779e-06, + "loss": 4.8637, + "step": 15060 + }, + { + "epoch": 0.8327805417357657, + "grad_norm": 3.9942774772644043, + "learning_rate": 8.598635588402502e-06, + "loss": 4.8782, + "step": 15065 + }, + { + "epoch": 0.8330569375345495, + "grad_norm": 2.853957176208496, + "learning_rate": 8.584422967595225e-06, + "loss": 4.9473, + "step": 15070 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 3.749110460281372, + "learning_rate": 8.570210346787947e-06, + "loss": 5.012, + "step": 15075 + }, + { + "epoch": 0.8336097291321172, + "grad_norm": 4.649420261383057, + "learning_rate": 8.555997725980671e-06, + "loss": 4.7911, + "step": 15080 + }, + { + "epoch": 0.8338861249309011, + "grad_norm": 3.9291510581970215, + "learning_rate": 8.541785105173395e-06, + "loss": 5.1986, + "step": 15085 + }, + { + "epoch": 0.8341625207296849, + "grad_norm": 3.576134443283081, + "learning_rate": 8.527572484366117e-06, + "loss": 5.0059, + "step": 15090 + }, + { + "epoch": 0.8344389165284688, + "grad_norm": 3.902698516845703, + "learning_rate": 8.513359863558841e-06, + "loss": 4.8758, + "step": 15095 + }, + { + "epoch": 0.8347153123272526, + "grad_norm": 4.259060859680176, + "learning_rate": 8.499147242751564e-06, + "loss": 4.8227, + "step": 15100 + }, + { + "epoch": 0.8349917081260365, + "grad_norm": 3.539156436920166, + "learning_rate": 8.484934621944286e-06, + "loss": 5.0189, + "step": 15105 + }, + { + "epoch": 0.8352681039248203, + "grad_norm": 4.712406635284424, + "learning_rate": 8.47072200113701e-06, + "loss": 5.0471, + "step": 15110 + }, + { + "epoch": 0.8355444997236042, + "grad_norm": 3.5508053302764893, + "learning_rate": 8.456509380329732e-06, + "loss": 4.8707, + "step": 15115 + }, + { + "epoch": 0.835820895522388, + "grad_norm": 3.3851168155670166, + "learning_rate": 8.442296759522458e-06, + "loss": 4.8683, + "step": 15120 + }, + { + "epoch": 0.8360972913211719, + "grad_norm": 3.6790895462036133, + "learning_rate": 8.42808413871518e-06, + "loss": 4.8464, + "step": 15125 + }, + { + "epoch": 0.8363736871199557, + "grad_norm": 3.053698778152466, + "learning_rate": 8.413871517907902e-06, + "loss": 4.9889, + "step": 15130 + }, + { + "epoch": 0.8366500829187397, + "grad_norm": 4.331393718719482, + "learning_rate": 8.399658897100626e-06, + "loss": 5.4066, + "step": 15135 + }, + { + "epoch": 0.8369264787175235, + "grad_norm": 3.729410171508789, + "learning_rate": 8.385446276293349e-06, + "loss": 5.0611, + "step": 15140 + }, + { + "epoch": 0.8372028745163074, + "grad_norm": 3.60453724861145, + "learning_rate": 8.371233655486071e-06, + "loss": 4.8469, + "step": 15145 + }, + { + "epoch": 0.8374792703150912, + "grad_norm": 4.959109306335449, + "learning_rate": 8.357021034678795e-06, + "loss": 4.938, + "step": 15150 + }, + { + "epoch": 0.8377556661138751, + "grad_norm": 4.496500015258789, + "learning_rate": 8.342808413871519e-06, + "loss": 4.7706, + "step": 15155 + }, + { + "epoch": 0.8380320619126589, + "grad_norm": 4.093534469604492, + "learning_rate": 8.328595793064241e-06, + "loss": 5.1374, + "step": 15160 + }, + { + "epoch": 0.8383084577114428, + "grad_norm": 3.628263473510742, + "learning_rate": 8.314383172256965e-06, + "loss": 4.7076, + "step": 15165 + }, + { + "epoch": 0.8385848535102266, + "grad_norm": 3.8275880813598633, + "learning_rate": 8.300170551449687e-06, + "loss": 4.5418, + "step": 15170 + }, + { + "epoch": 0.8388612493090105, + "grad_norm": 4.351752758026123, + "learning_rate": 8.285957930642411e-06, + "loss": 4.903, + "step": 15175 + }, + { + "epoch": 0.8391376451077943, + "grad_norm": 3.621277093887329, + "learning_rate": 8.271745309835134e-06, + "loss": 4.9818, + "step": 15180 + }, + { + "epoch": 0.8394140409065782, + "grad_norm": 3.3908252716064453, + "learning_rate": 8.257532689027856e-06, + "loss": 5.084, + "step": 15185 + }, + { + "epoch": 0.839690436705362, + "grad_norm": 3.7233057022094727, + "learning_rate": 8.24332006822058e-06, + "loss": 5.2591, + "step": 15190 + }, + { + "epoch": 0.8399668325041459, + "grad_norm": 3.7437429428100586, + "learning_rate": 8.229107447413304e-06, + "loss": 4.817, + "step": 15195 + }, + { + "epoch": 0.8402432283029297, + "grad_norm": 3.7888224124908447, + "learning_rate": 8.214894826606026e-06, + "loss": 4.598, + "step": 15200 + }, + { + "epoch": 0.8405196241017137, + "grad_norm": 3.3755719661712646, + "learning_rate": 8.20068220579875e-06, + "loss": 5.1114, + "step": 15205 + }, + { + "epoch": 0.8407960199004975, + "grad_norm": 3.7054946422576904, + "learning_rate": 8.186469584991472e-06, + "loss": 4.6614, + "step": 15210 + }, + { + "epoch": 0.8410724156992814, + "grad_norm": 3.747762441635132, + "learning_rate": 8.172256964184195e-06, + "loss": 4.7052, + "step": 15215 + }, + { + "epoch": 0.8413488114980652, + "grad_norm": 3.4834938049316406, + "learning_rate": 8.158044343376919e-06, + "loss": 5.1221, + "step": 15220 + }, + { + "epoch": 0.8416252072968491, + "grad_norm": 3.943834066390991, + "learning_rate": 8.143831722569643e-06, + "loss": 4.9175, + "step": 15225 + }, + { + "epoch": 0.841901603095633, + "grad_norm": 3.3898112773895264, + "learning_rate": 8.129619101762367e-06, + "loss": 5.0414, + "step": 15230 + }, + { + "epoch": 0.8421779988944168, + "grad_norm": 3.7850241661071777, + "learning_rate": 8.115406480955089e-06, + "loss": 4.9141, + "step": 15235 + }, + { + "epoch": 0.8424543946932007, + "grad_norm": 3.2577099800109863, + "learning_rate": 8.101193860147811e-06, + "loss": 4.7304, + "step": 15240 + }, + { + "epoch": 0.8427307904919845, + "grad_norm": 3.3998703956604004, + "learning_rate": 8.086981239340535e-06, + "loss": 4.9697, + "step": 15245 + }, + { + "epoch": 0.8430071862907684, + "grad_norm": 4.133670330047607, + "learning_rate": 8.072768618533258e-06, + "loss": 4.9938, + "step": 15250 + }, + { + "epoch": 0.8432835820895522, + "grad_norm": 4.74376916885376, + "learning_rate": 8.05855599772598e-06, + "loss": 4.8981, + "step": 15255 + }, + { + "epoch": 0.8435599778883361, + "grad_norm": 3.422651529312134, + "learning_rate": 8.044343376918704e-06, + "loss": 4.9323, + "step": 15260 + }, + { + "epoch": 0.84383637368712, + "grad_norm": 4.132688999176025, + "learning_rate": 8.030130756111428e-06, + "loss": 4.8445, + "step": 15265 + }, + { + "epoch": 0.8441127694859039, + "grad_norm": 4.657894134521484, + "learning_rate": 8.01591813530415e-06, + "loss": 4.7014, + "step": 15270 + }, + { + "epoch": 0.8443891652846877, + "grad_norm": 4.646186828613281, + "learning_rate": 8.001705514496874e-06, + "loss": 5.2851, + "step": 15275 + }, + { + "epoch": 0.8446655610834716, + "grad_norm": 4.24678897857666, + "learning_rate": 7.987492893689596e-06, + "loss": 4.6824, + "step": 15280 + }, + { + "epoch": 0.8449419568822554, + "grad_norm": 3.599647045135498, + "learning_rate": 7.97328027288232e-06, + "loss": 4.8003, + "step": 15285 + }, + { + "epoch": 0.8452183526810393, + "grad_norm": 3.2087178230285645, + "learning_rate": 7.959067652075043e-06, + "loss": 4.6724, + "step": 15290 + }, + { + "epoch": 0.8454947484798231, + "grad_norm": 3.3129324913024902, + "learning_rate": 7.944855031267767e-06, + "loss": 4.9888, + "step": 15295 + }, + { + "epoch": 0.845771144278607, + "grad_norm": 4.16626501083374, + "learning_rate": 7.93064241046049e-06, + "loss": 4.9972, + "step": 15300 + }, + { + "epoch": 0.8460475400773908, + "grad_norm": 3.8757193088531494, + "learning_rate": 7.916429789653213e-06, + "loss": 4.9566, + "step": 15305 + }, + { + "epoch": 0.8463239358761747, + "grad_norm": 3.9059760570526123, + "learning_rate": 7.902217168845935e-06, + "loss": 4.7056, + "step": 15310 + }, + { + "epoch": 0.8466003316749585, + "grad_norm": 3.2571303844451904, + "learning_rate": 7.888004548038659e-06, + "loss": 5.0104, + "step": 15315 + }, + { + "epoch": 0.8468767274737424, + "grad_norm": 3.6211488246917725, + "learning_rate": 7.873791927231381e-06, + "loss": 4.9973, + "step": 15320 + }, + { + "epoch": 0.8471531232725262, + "grad_norm": 4.511015892028809, + "learning_rate": 7.859579306424104e-06, + "loss": 4.8835, + "step": 15325 + }, + { + "epoch": 0.8474295190713101, + "grad_norm": 3.3514885902404785, + "learning_rate": 7.845366685616828e-06, + "loss": 5.1082, + "step": 15330 + }, + { + "epoch": 0.847705914870094, + "grad_norm": 4.153277397155762, + "learning_rate": 7.831154064809552e-06, + "loss": 5.1117, + "step": 15335 + }, + { + "epoch": 0.8479823106688779, + "grad_norm": 3.661910057067871, + "learning_rate": 7.816941444002276e-06, + "loss": 4.785, + "step": 15340 + }, + { + "epoch": 0.8482587064676617, + "grad_norm": 3.4714293479919434, + "learning_rate": 7.802728823194998e-06, + "loss": 4.9615, + "step": 15345 + }, + { + "epoch": 0.8485351022664456, + "grad_norm": 3.412801504135132, + "learning_rate": 7.78851620238772e-06, + "loss": 4.7827, + "step": 15350 + }, + { + "epoch": 0.8488114980652294, + "grad_norm": 4.46990966796875, + "learning_rate": 7.774303581580444e-06, + "loss": 4.8436, + "step": 15355 + }, + { + "epoch": 0.8490878938640133, + "grad_norm": 4.014577388763428, + "learning_rate": 7.760090960773166e-06, + "loss": 4.9757, + "step": 15360 + }, + { + "epoch": 0.8493642896627971, + "grad_norm": 3.89868426322937, + "learning_rate": 7.74587833996589e-06, + "loss": 4.8749, + "step": 15365 + }, + { + "epoch": 0.849640685461581, + "grad_norm": 2.8966128826141357, + "learning_rate": 7.731665719158614e-06, + "loss": 4.7, + "step": 15370 + }, + { + "epoch": 0.8499170812603648, + "grad_norm": 4.55120849609375, + "learning_rate": 7.717453098351337e-06, + "loss": 4.7576, + "step": 15375 + }, + { + "epoch": 0.8501934770591487, + "grad_norm": 4.043015003204346, + "learning_rate": 7.703240477544059e-06, + "loss": 4.7656, + "step": 15380 + }, + { + "epoch": 0.8504698728579325, + "grad_norm": 4.398715019226074, + "learning_rate": 7.689027856736783e-06, + "loss": 4.7285, + "step": 15385 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 4.311310291290283, + "learning_rate": 7.674815235929505e-06, + "loss": 4.8688, + "step": 15390 + }, + { + "epoch": 0.8510226644555002, + "grad_norm": 3.4514060020446777, + "learning_rate": 7.66060261512223e-06, + "loss": 4.7912, + "step": 15395 + }, + { + "epoch": 0.8512990602542841, + "grad_norm": 3.418095588684082, + "learning_rate": 7.646389994314952e-06, + "loss": 4.8534, + "step": 15400 + }, + { + "epoch": 0.851575456053068, + "grad_norm": 4.0209503173828125, + "learning_rate": 7.632177373507675e-06, + "loss": 5.2002, + "step": 15405 + }, + { + "epoch": 0.8518518518518519, + "grad_norm": 2.8665504455566406, + "learning_rate": 7.617964752700399e-06, + "loss": 5.142, + "step": 15410 + }, + { + "epoch": 0.8521282476506357, + "grad_norm": 3.491189956665039, + "learning_rate": 7.603752131893122e-06, + "loss": 4.6841, + "step": 15415 + }, + { + "epoch": 0.8524046434494196, + "grad_norm": 3.8220739364624023, + "learning_rate": 7.589539511085844e-06, + "loss": 4.929, + "step": 15420 + }, + { + "epoch": 0.8526810392482034, + "grad_norm": 3.5828657150268555, + "learning_rate": 7.575326890278568e-06, + "loss": 5.0016, + "step": 15425 + }, + { + "epoch": 0.8529574350469873, + "grad_norm": 4.3814921379089355, + "learning_rate": 7.561114269471291e-06, + "loss": 4.8032, + "step": 15430 + }, + { + "epoch": 0.8532338308457711, + "grad_norm": 3.4918086528778076, + "learning_rate": 7.5469016486640134e-06, + "loss": 4.846, + "step": 15435 + }, + { + "epoch": 0.853510226644555, + "grad_norm": 3.771287679672241, + "learning_rate": 7.532689027856737e-06, + "loss": 4.791, + "step": 15440 + }, + { + "epoch": 0.8537866224433389, + "grad_norm": 3.5989866256713867, + "learning_rate": 7.51847640704946e-06, + "loss": 4.9933, + "step": 15445 + }, + { + "epoch": 0.8540630182421227, + "grad_norm": 3.3803904056549072, + "learning_rate": 7.504263786242183e-06, + "loss": 4.8067, + "step": 15450 + }, + { + "epoch": 0.8543394140409066, + "grad_norm": 4.378183841705322, + "learning_rate": 7.490051165434907e-06, + "loss": 5.3456, + "step": 15455 + }, + { + "epoch": 0.8546158098396904, + "grad_norm": 4.145111560821533, + "learning_rate": 7.475838544627629e-06, + "loss": 4.833, + "step": 15460 + }, + { + "epoch": 0.8548922056384743, + "grad_norm": 3.1699306964874268, + "learning_rate": 7.461625923820353e-06, + "loss": 4.7371, + "step": 15465 + }, + { + "epoch": 0.8551686014372581, + "grad_norm": 3.570518732070923, + "learning_rate": 7.447413303013076e-06, + "loss": 5.0778, + "step": 15470 + }, + { + "epoch": 0.8554449972360421, + "grad_norm": 3.922854423522949, + "learning_rate": 7.4332006822057985e-06, + "loss": 4.9517, + "step": 15475 + }, + { + "epoch": 0.8557213930348259, + "grad_norm": 4.147318363189697, + "learning_rate": 7.4189880613985225e-06, + "loss": 4.9921, + "step": 15480 + }, + { + "epoch": 0.8559977888336098, + "grad_norm": 3.7411043643951416, + "learning_rate": 7.404775440591246e-06, + "loss": 5.2079, + "step": 15485 + }, + { + "epoch": 0.8562741846323936, + "grad_norm": 3.129054069519043, + "learning_rate": 7.390562819783968e-06, + "loss": 4.8514, + "step": 15490 + }, + { + "epoch": 0.8565505804311775, + "grad_norm": 3.2814624309539795, + "learning_rate": 7.376350198976692e-06, + "loss": 4.9005, + "step": 15495 + }, + { + "epoch": 0.8568269762299613, + "grad_norm": 4.446510314941406, + "learning_rate": 7.362137578169415e-06, + "loss": 4.6255, + "step": 15500 + }, + { + "epoch": 0.8571033720287452, + "grad_norm": 4.147850036621094, + "learning_rate": 7.347924957362137e-06, + "loss": 4.7585, + "step": 15505 + }, + { + "epoch": 0.857379767827529, + "grad_norm": 3.926903247833252, + "learning_rate": 7.333712336554861e-06, + "loss": 5.0353, + "step": 15510 + }, + { + "epoch": 0.8576561636263129, + "grad_norm": 4.027170658111572, + "learning_rate": 7.3194997157475835e-06, + "loss": 4.9129, + "step": 15515 + }, + { + "epoch": 0.8579325594250967, + "grad_norm": 3.412487030029297, + "learning_rate": 7.3052870949403075e-06, + "loss": 4.8461, + "step": 15520 + }, + { + "epoch": 0.8582089552238806, + "grad_norm": 4.255180358886719, + "learning_rate": 7.291074474133031e-06, + "loss": 5.133, + "step": 15525 + }, + { + "epoch": 0.8584853510226644, + "grad_norm": 3.505549192428589, + "learning_rate": 7.276861853325753e-06, + "loss": 5.1632, + "step": 15530 + }, + { + "epoch": 0.8587617468214483, + "grad_norm": 3.490243434906006, + "learning_rate": 7.262649232518477e-06, + "loss": 4.8109, + "step": 15535 + }, + { + "epoch": 0.8590381426202321, + "grad_norm": 3.9529364109039307, + "learning_rate": 7.2484366117112e-06, + "loss": 4.7609, + "step": 15540 + }, + { + "epoch": 0.8593145384190161, + "grad_norm": 4.20280122756958, + "learning_rate": 7.234223990903922e-06, + "loss": 4.8306, + "step": 15545 + }, + { + "epoch": 0.8595909342177999, + "grad_norm": 3.574753761291504, + "learning_rate": 7.220011370096646e-06, + "loss": 5.051, + "step": 15550 + }, + { + "epoch": 0.8598673300165838, + "grad_norm": 3.6286284923553467, + "learning_rate": 7.2057987492893694e-06, + "loss": 4.7709, + "step": 15555 + }, + { + "epoch": 0.8601437258153676, + "grad_norm": 5.038646697998047, + "learning_rate": 7.191586128482092e-06, + "loss": 4.9982, + "step": 15560 + }, + { + "epoch": 0.8604201216141515, + "grad_norm": 3.0044443607330322, + "learning_rate": 7.177373507674816e-06, + "loss": 4.8929, + "step": 15565 + }, + { + "epoch": 0.8606965174129353, + "grad_norm": 3.639540672302246, + "learning_rate": 7.163160886867539e-06, + "loss": 4.9908, + "step": 15570 + }, + { + "epoch": 0.8609729132117192, + "grad_norm": 3.9962198734283447, + "learning_rate": 7.148948266060263e-06, + "loss": 4.6839, + "step": 15575 + }, + { + "epoch": 0.861249309010503, + "grad_norm": 3.4947597980499268, + "learning_rate": 7.134735645252985e-06, + "loss": 4.9089, + "step": 15580 + }, + { + "epoch": 0.8615257048092869, + "grad_norm": 3.2274651527404785, + "learning_rate": 7.120523024445707e-06, + "loss": 4.6201, + "step": 15585 + }, + { + "epoch": 0.8618021006080707, + "grad_norm": 4.237607955932617, + "learning_rate": 7.106310403638431e-06, + "loss": 5.0777, + "step": 15590 + }, + { + "epoch": 0.8620784964068546, + "grad_norm": 2.7979462146759033, + "learning_rate": 7.0920977828311545e-06, + "loss": 4.7732, + "step": 15595 + }, + { + "epoch": 0.8623548922056384, + "grad_norm": 3.392643928527832, + "learning_rate": 7.077885162023877e-06, + "loss": 4.8292, + "step": 15600 + }, + { + "epoch": 0.8626312880044223, + "grad_norm": 4.662559509277344, + "learning_rate": 7.063672541216601e-06, + "loss": 4.7918, + "step": 15605 + }, + { + "epoch": 0.8629076838032061, + "grad_norm": 3.801340341567993, + "learning_rate": 7.049459920409324e-06, + "loss": 5.173, + "step": 15610 + }, + { + "epoch": 0.8631840796019901, + "grad_norm": 3.8719255924224854, + "learning_rate": 7.035247299602046e-06, + "loss": 4.9831, + "step": 15615 + }, + { + "epoch": 0.8634604754007739, + "grad_norm": 3.9387316703796387, + "learning_rate": 7.02103467879477e-06, + "loss": 4.8149, + "step": 15620 + }, + { + "epoch": 0.8637368711995578, + "grad_norm": 3.568291664123535, + "learning_rate": 7.006822057987493e-06, + "loss": 4.8419, + "step": 15625 + }, + { + "epoch": 0.8640132669983416, + "grad_norm": 4.031901836395264, + "learning_rate": 6.992609437180217e-06, + "loss": 4.6986, + "step": 15630 + }, + { + "epoch": 0.8642896627971255, + "grad_norm": 4.286612033843994, + "learning_rate": 6.9783968163729396e-06, + "loss": 4.9399, + "step": 15635 + }, + { + "epoch": 0.8645660585959093, + "grad_norm": 3.5853421688079834, + "learning_rate": 6.964184195565662e-06, + "loss": 5.2017, + "step": 15640 + }, + { + "epoch": 0.8648424543946932, + "grad_norm": 4.815415382385254, + "learning_rate": 6.949971574758387e-06, + "loss": 5.2114, + "step": 15645 + }, + { + "epoch": 0.865118850193477, + "grad_norm": 3.401326894760132, + "learning_rate": 6.935758953951109e-06, + "loss": 4.821, + "step": 15650 + }, + { + "epoch": 0.8653952459922609, + "grad_norm": 3.949343681335449, + "learning_rate": 6.921546333143831e-06, + "loss": 5.2086, + "step": 15655 + }, + { + "epoch": 0.8656716417910447, + "grad_norm": 4.092352867126465, + "learning_rate": 6.907333712336555e-06, + "loss": 4.8635, + "step": 15660 + }, + { + "epoch": 0.8659480375898286, + "grad_norm": 4.154953956604004, + "learning_rate": 6.893121091529278e-06, + "loss": 4.7274, + "step": 15665 + }, + { + "epoch": 0.8662244333886125, + "grad_norm": 3.2108614444732666, + "learning_rate": 6.878908470722001e-06, + "loss": 5.0751, + "step": 15670 + }, + { + "epoch": 0.8665008291873963, + "grad_norm": 3.6406795978546143, + "learning_rate": 6.864695849914725e-06, + "loss": 4.935, + "step": 15675 + }, + { + "epoch": 0.8667772249861803, + "grad_norm": 3.644953727722168, + "learning_rate": 6.850483229107448e-06, + "loss": 5.0036, + "step": 15680 + }, + { + "epoch": 0.8670536207849641, + "grad_norm": 3.906993865966797, + "learning_rate": 6.836270608300172e-06, + "loss": 4.7136, + "step": 15685 + }, + { + "epoch": 0.867330016583748, + "grad_norm": 3.677238702774048, + "learning_rate": 6.822057987492894e-06, + "loss": 4.8841, + "step": 15690 + }, + { + "epoch": 0.8676064123825318, + "grad_norm": 3.396554946899414, + "learning_rate": 6.807845366685617e-06, + "loss": 4.9053, + "step": 15695 + }, + { + "epoch": 0.8678828081813157, + "grad_norm": 4.678235054016113, + "learning_rate": 6.793632745878341e-06, + "loss": 4.7476, + "step": 15700 + }, + { + "epoch": 0.8681592039800995, + "grad_norm": 3.4147768020629883, + "learning_rate": 6.779420125071063e-06, + "loss": 5.0731, + "step": 15705 + }, + { + "epoch": 0.8684355997788834, + "grad_norm": 4.460427761077881, + "learning_rate": 6.765207504263786e-06, + "loss": 4.9435, + "step": 15710 + }, + { + "epoch": 0.8687119955776672, + "grad_norm": 4.205367565155029, + "learning_rate": 6.75099488345651e-06, + "loss": 4.7848, + "step": 15715 + }, + { + "epoch": 0.8689883913764511, + "grad_norm": 3.436741828918457, + "learning_rate": 6.736782262649233e-06, + "loss": 4.7811, + "step": 15720 + }, + { + "epoch": 0.8692647871752349, + "grad_norm": 3.21610164642334, + "learning_rate": 6.722569641841955e-06, + "loss": 4.8657, + "step": 15725 + }, + { + "epoch": 0.8695411829740188, + "grad_norm": 3.93279767036438, + "learning_rate": 6.708357021034679e-06, + "loss": 4.8837, + "step": 15730 + }, + { + "epoch": 0.8698175787728026, + "grad_norm": 3.3323113918304443, + "learning_rate": 6.694144400227402e-06, + "loss": 5.143, + "step": 15735 + }, + { + "epoch": 0.8700939745715865, + "grad_norm": 3.971737861633301, + "learning_rate": 6.679931779420126e-06, + "loss": 4.9652, + "step": 15740 + }, + { + "epoch": 0.8703703703703703, + "grad_norm": 3.596555471420288, + "learning_rate": 6.6657191586128485e-06, + "loss": 4.9259, + "step": 15745 + }, + { + "epoch": 0.8706467661691543, + "grad_norm": 3.582368850708008, + "learning_rate": 6.651506537805572e-06, + "loss": 5.0243, + "step": 15750 + }, + { + "epoch": 0.8709231619679381, + "grad_norm": 3.9322404861450195, + "learning_rate": 6.637293916998296e-06, + "loss": 5.2276, + "step": 15755 + }, + { + "epoch": 0.871199557766722, + "grad_norm": 4.058863162994385, + "learning_rate": 6.623081296191018e-06, + "loss": 5.2218, + "step": 15760 + }, + { + "epoch": 0.8714759535655058, + "grad_norm": 3.4161527156829834, + "learning_rate": 6.608868675383741e-06, + "loss": 4.6876, + "step": 15765 + }, + { + "epoch": 0.8717523493642897, + "grad_norm": 2.900460958480835, + "learning_rate": 6.594656054576465e-06, + "loss": 4.7734, + "step": 15770 + }, + { + "epoch": 0.8720287451630735, + "grad_norm": 3.5918712615966797, + "learning_rate": 6.580443433769187e-06, + "loss": 4.7184, + "step": 15775 + }, + { + "epoch": 0.8723051409618574, + "grad_norm": 3.992398500442505, + "learning_rate": 6.5662308129619095e-06, + "loss": 4.966, + "step": 15780 + }, + { + "epoch": 0.8725815367606412, + "grad_norm": 3.817514419555664, + "learning_rate": 6.5520181921546335e-06, + "loss": 5.0287, + "step": 15785 + }, + { + "epoch": 0.8728579325594251, + "grad_norm": 3.5824077129364014, + "learning_rate": 6.537805571347357e-06, + "loss": 4.9329, + "step": 15790 + }, + { + "epoch": 0.8731343283582089, + "grad_norm": 3.7401037216186523, + "learning_rate": 6.523592950540081e-06, + "loss": 4.9287, + "step": 15795 + }, + { + "epoch": 0.8734107241569928, + "grad_norm": 3.4796454906463623, + "learning_rate": 6.509380329732803e-06, + "loss": 4.9727, + "step": 15800 + }, + { + "epoch": 0.8736871199557766, + "grad_norm": 5.003295421600342, + "learning_rate": 6.495167708925526e-06, + "loss": 4.7314, + "step": 15805 + }, + { + "epoch": 0.8739635157545605, + "grad_norm": 4.505870819091797, + "learning_rate": 6.48095508811825e-06, + "loss": 5.0809, + "step": 15810 + }, + { + "epoch": 0.8742399115533444, + "grad_norm": 3.863006114959717, + "learning_rate": 6.466742467310972e-06, + "loss": 4.6385, + "step": 15815 + }, + { + "epoch": 0.8745163073521283, + "grad_norm": 3.4922070503234863, + "learning_rate": 6.4525298465036955e-06, + "loss": 4.7127, + "step": 15820 + }, + { + "epoch": 0.8747927031509121, + "grad_norm": 3.6463775634765625, + "learning_rate": 6.4383172256964194e-06, + "loss": 4.938, + "step": 15825 + }, + { + "epoch": 0.875069098949696, + "grad_norm": 3.5146822929382324, + "learning_rate": 6.424104604889142e-06, + "loss": 4.8737, + "step": 15830 + }, + { + "epoch": 0.8753454947484798, + "grad_norm": 2.9423630237579346, + "learning_rate": 6.409891984081864e-06, + "loss": 4.7986, + "step": 15835 + }, + { + "epoch": 0.8756218905472637, + "grad_norm": 4.098637580871582, + "learning_rate": 6.395679363274589e-06, + "loss": 4.7571, + "step": 15840 + }, + { + "epoch": 0.8758982863460475, + "grad_norm": 2.9004056453704834, + "learning_rate": 6.381466742467311e-06, + "loss": 4.716, + "step": 15845 + }, + { + "epoch": 0.8761746821448314, + "grad_norm": 3.797128915786743, + "learning_rate": 6.367254121660035e-06, + "loss": 5.0502, + "step": 15850 + }, + { + "epoch": 0.8764510779436152, + "grad_norm": 3.994654417037964, + "learning_rate": 6.353041500852757e-06, + "loss": 4.898, + "step": 15855 + }, + { + "epoch": 0.8767274737423991, + "grad_norm": 3.2940444946289062, + "learning_rate": 6.3388288800454805e-06, + "loss": 4.8664, + "step": 15860 + }, + { + "epoch": 0.8770038695411829, + "grad_norm": 4.2146124839782715, + "learning_rate": 6.3246162592382045e-06, + "loss": 4.8941, + "step": 15865 + }, + { + "epoch": 0.8772802653399668, + "grad_norm": 3.3862946033477783, + "learning_rate": 6.310403638430927e-06, + "loss": 4.8247, + "step": 15870 + }, + { + "epoch": 0.8775566611387506, + "grad_norm": 3.480405807495117, + "learning_rate": 6.29619101762365e-06, + "loss": 5.1447, + "step": 15875 + }, + { + "epoch": 0.8778330569375346, + "grad_norm": 3.4733119010925293, + "learning_rate": 6.281978396816374e-06, + "loss": 4.6635, + "step": 15880 + }, + { + "epoch": 0.8781094527363185, + "grad_norm": 3.853881359100342, + "learning_rate": 6.267765776009096e-06, + "loss": 5.0099, + "step": 15885 + }, + { + "epoch": 0.8783858485351023, + "grad_norm": 3.801287889480591, + "learning_rate": 6.253553155201819e-06, + "loss": 4.6277, + "step": 15890 + }, + { + "epoch": 0.8786622443338862, + "grad_norm": 3.785771131515503, + "learning_rate": 6.239340534394543e-06, + "loss": 4.7045, + "step": 15895 + }, + { + "epoch": 0.87893864013267, + "grad_norm": 4.128140449523926, + "learning_rate": 6.2251279135872656e-06, + "loss": 5.0229, + "step": 15900 + }, + { + "epoch": 0.8792150359314539, + "grad_norm": 3.9363701343536377, + "learning_rate": 6.210915292779989e-06, + "loss": 5.1252, + "step": 15905 + }, + { + "epoch": 0.8794914317302377, + "grad_norm": 3.420142889022827, + "learning_rate": 6.196702671972713e-06, + "loss": 5.1304, + "step": 15910 + }, + { + "epoch": 0.8797678275290216, + "grad_norm": 4.293994426727295, + "learning_rate": 6.182490051165435e-06, + "loss": 5.1745, + "step": 15915 + }, + { + "epoch": 0.8800442233278054, + "grad_norm": 4.5453314781188965, + "learning_rate": 6.168277430358158e-06, + "loss": 4.7134, + "step": 15920 + }, + { + "epoch": 0.8803206191265893, + "grad_norm": 3.731394052505493, + "learning_rate": 6.154064809550881e-06, + "loss": 4.7987, + "step": 15925 + }, + { + "epoch": 0.8805970149253731, + "grad_norm": 3.9155333042144775, + "learning_rate": 6.139852188743605e-06, + "loss": 4.8764, + "step": 15930 + }, + { + "epoch": 0.880873410724157, + "grad_norm": 3.251206636428833, + "learning_rate": 6.1256395679363275e-06, + "loss": 4.9742, + "step": 15935 + }, + { + "epoch": 0.8811498065229408, + "grad_norm": 3.428173542022705, + "learning_rate": 6.111426947129051e-06, + "loss": 5.0398, + "step": 15940 + }, + { + "epoch": 0.8814262023217247, + "grad_norm": 4.512082576751709, + "learning_rate": 6.097214326321774e-06, + "loss": 4.9136, + "step": 15945 + }, + { + "epoch": 0.8817025981205086, + "grad_norm": 3.4978713989257812, + "learning_rate": 6.083001705514498e-06, + "loss": 4.732, + "step": 15950 + }, + { + "epoch": 0.8819789939192925, + "grad_norm": 4.127050876617432, + "learning_rate": 6.06878908470722e-06, + "loss": 4.9104, + "step": 15955 + }, + { + "epoch": 0.8822553897180763, + "grad_norm": 3.6599040031433105, + "learning_rate": 6.054576463899943e-06, + "loss": 5.2423, + "step": 15960 + }, + { + "epoch": 0.8825317855168602, + "grad_norm": 3.895284652709961, + "learning_rate": 6.040363843092667e-06, + "loss": 4.9198, + "step": 15965 + }, + { + "epoch": 0.882808181315644, + "grad_norm": 3.3798611164093018, + "learning_rate": 6.026151222285389e-06, + "loss": 4.7896, + "step": 15970 + }, + { + "epoch": 0.8830845771144279, + "grad_norm": 3.8356308937072754, + "learning_rate": 6.0119386014781126e-06, + "loss": 5.1028, + "step": 15975 + }, + { + "epoch": 0.8833609729132117, + "grad_norm": 3.9979400634765625, + "learning_rate": 5.997725980670836e-06, + "loss": 4.9204, + "step": 15980 + }, + { + "epoch": 0.8836373687119956, + "grad_norm": 3.993461847305298, + "learning_rate": 5.98351335986356e-06, + "loss": 4.7276, + "step": 15985 + }, + { + "epoch": 0.8839137645107794, + "grad_norm": 3.9294278621673584, + "learning_rate": 5.969300739056282e-06, + "loss": 4.666, + "step": 15990 + }, + { + "epoch": 0.8841901603095633, + "grad_norm": 5.005484104156494, + "learning_rate": 5.955088118249005e-06, + "loss": 4.9111, + "step": 15995 + }, + { + "epoch": 0.8844665561083471, + "grad_norm": 3.216432809829712, + "learning_rate": 5.940875497441729e-06, + "loss": 4.6319, + "step": 16000 + }, + { + "epoch": 0.884742951907131, + "grad_norm": 3.6470518112182617, + "learning_rate": 5.926662876634452e-06, + "loss": 4.7934, + "step": 16005 + }, + { + "epoch": 0.8850193477059148, + "grad_norm": 3.7931535243988037, + "learning_rate": 5.9124502558271745e-06, + "loss": 4.9271, + "step": 16010 + }, + { + "epoch": 0.8852957435046988, + "grad_norm": 3.077409267425537, + "learning_rate": 5.898237635019898e-06, + "loss": 4.7351, + "step": 16015 + }, + { + "epoch": 0.8855721393034826, + "grad_norm": 3.6445906162261963, + "learning_rate": 5.884025014212622e-06, + "loss": 4.8911, + "step": 16020 + }, + { + "epoch": 0.8858485351022665, + "grad_norm": 4.014408111572266, + "learning_rate": 5.869812393405344e-06, + "loss": 4.9776, + "step": 16025 + }, + { + "epoch": 0.8861249309010503, + "grad_norm": 3.4272587299346924, + "learning_rate": 5.855599772598067e-06, + "loss": 4.9316, + "step": 16030 + }, + { + "epoch": 0.8864013266998342, + "grad_norm": 3.4246294498443604, + "learning_rate": 5.841387151790791e-06, + "loss": 4.7851, + "step": 16035 + }, + { + "epoch": 0.886677722498618, + "grad_norm": 3.426506996154785, + "learning_rate": 5.827174530983514e-06, + "loss": 5.0848, + "step": 16040 + }, + { + "epoch": 0.8869541182974019, + "grad_norm": 4.0810956954956055, + "learning_rate": 5.812961910176236e-06, + "loss": 4.5215, + "step": 16045 + }, + { + "epoch": 0.8872305140961857, + "grad_norm": 2.9384403228759766, + "learning_rate": 5.7987492893689595e-06, + "loss": 4.7552, + "step": 16050 + }, + { + "epoch": 0.8875069098949696, + "grad_norm": 3.227759838104248, + "learning_rate": 5.7845366685616835e-06, + "loss": 4.7817, + "step": 16055 + }, + { + "epoch": 0.8877833056937534, + "grad_norm": 4.257302761077881, + "learning_rate": 5.770324047754406e-06, + "loss": 4.7766, + "step": 16060 + }, + { + "epoch": 0.8880597014925373, + "grad_norm": 4.3714919090271, + "learning_rate": 5.756111426947129e-06, + "loss": 5.0406, + "step": 16065 + }, + { + "epoch": 0.8883360972913211, + "grad_norm": 3.33453369140625, + "learning_rate": 5.741898806139853e-06, + "loss": 5.4695, + "step": 16070 + }, + { + "epoch": 0.888612493090105, + "grad_norm": 3.6694791316986084, + "learning_rate": 5.727686185332576e-06, + "loss": 4.7901, + "step": 16075 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 3.3092100620269775, + "learning_rate": 5.713473564525298e-06, + "loss": 5.0055, + "step": 16080 + }, + { + "epoch": 0.8891652846876728, + "grad_norm": 3.254375457763672, + "learning_rate": 5.6992609437180215e-06, + "loss": 4.8603, + "step": 16085 + }, + { + "epoch": 0.8894416804864566, + "grad_norm": 3.5926568508148193, + "learning_rate": 5.6850483229107454e-06, + "loss": 4.9635, + "step": 16090 + }, + { + "epoch": 0.8897180762852405, + "grad_norm": 3.4119133949279785, + "learning_rate": 5.6708357021034686e-06, + "loss": 5.0395, + "step": 16095 + }, + { + "epoch": 0.8899944720840243, + "grad_norm": 3.3834893703460693, + "learning_rate": 5.656623081296191e-06, + "loss": 5.145, + "step": 16100 + }, + { + "epoch": 0.8902708678828082, + "grad_norm": 3.796090602874756, + "learning_rate": 5.642410460488915e-06, + "loss": 4.8246, + "step": 16105 + }, + { + "epoch": 0.8905472636815921, + "grad_norm": 3.5259482860565186, + "learning_rate": 5.628197839681638e-06, + "loss": 4.8869, + "step": 16110 + }, + { + "epoch": 0.8908236594803759, + "grad_norm": 4.100551605224609, + "learning_rate": 5.61398521887436e-06, + "loss": 4.674, + "step": 16115 + }, + { + "epoch": 0.8911000552791598, + "grad_norm": 4.206357479095459, + "learning_rate": 5.599772598067083e-06, + "loss": 4.7143, + "step": 16120 + }, + { + "epoch": 0.8913764510779436, + "grad_norm": 3.823300838470459, + "learning_rate": 5.585559977259807e-06, + "loss": 4.8694, + "step": 16125 + }, + { + "epoch": 0.8916528468767275, + "grad_norm": 4.283262729644775, + "learning_rate": 5.5713473564525305e-06, + "loss": 5.0432, + "step": 16130 + }, + { + "epoch": 0.8919292426755113, + "grad_norm": 4.096394062042236, + "learning_rate": 5.557134735645253e-06, + "loss": 4.9027, + "step": 16135 + }, + { + "epoch": 0.8922056384742952, + "grad_norm": 3.4035484790802, + "learning_rate": 5.542922114837976e-06, + "loss": 4.8498, + "step": 16140 + }, + { + "epoch": 0.892482034273079, + "grad_norm": 3.974168539047241, + "learning_rate": 5.5287094940307e-06, + "loss": 4.9876, + "step": 16145 + }, + { + "epoch": 0.892758430071863, + "grad_norm": 4.489919662475586, + "learning_rate": 5.514496873223423e-06, + "loss": 4.757, + "step": 16150 + }, + { + "epoch": 0.8930348258706468, + "grad_norm": 3.5792441368103027, + "learning_rate": 5.500284252416145e-06, + "loss": 4.821, + "step": 16155 + }, + { + "epoch": 0.8933112216694307, + "grad_norm": 4.130000591278076, + "learning_rate": 5.486071631608869e-06, + "loss": 4.5594, + "step": 16160 + }, + { + "epoch": 0.8935876174682145, + "grad_norm": 4.009551048278809, + "learning_rate": 5.471859010801592e-06, + "loss": 5.024, + "step": 16165 + }, + { + "epoch": 0.8938640132669984, + "grad_norm": 4.058741569519043, + "learning_rate": 5.457646389994315e-06, + "loss": 5.031, + "step": 16170 + }, + { + "epoch": 0.8941404090657822, + "grad_norm": 3.8095996379852295, + "learning_rate": 5.443433769187038e-06, + "loss": 4.7272, + "step": 16175 + }, + { + "epoch": 0.8944168048645661, + "grad_norm": 3.7943928241729736, + "learning_rate": 5.429221148379762e-06, + "loss": 4.8543, + "step": 16180 + }, + { + "epoch": 0.8946932006633499, + "grad_norm": 3.153428316116333, + "learning_rate": 5.415008527572485e-06, + "loss": 4.6109, + "step": 16185 + }, + { + "epoch": 0.8949695964621338, + "grad_norm": 3.685887575149536, + "learning_rate": 5.400795906765207e-06, + "loss": 4.676, + "step": 16190 + }, + { + "epoch": 0.8952459922609176, + "grad_norm": 3.823957920074463, + "learning_rate": 5.386583285957931e-06, + "loss": 5.0588, + "step": 16195 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 3.5146028995513916, + "learning_rate": 5.372370665150654e-06, + "loss": 4.5432, + "step": 16200 + }, + { + "epoch": 0.8957987838584853, + "grad_norm": 3.8643455505371094, + "learning_rate": 5.3581580443433775e-06, + "loss": 5.029, + "step": 16205 + }, + { + "epoch": 0.8960751796572692, + "grad_norm": 4.048851490020752, + "learning_rate": 5.3439454235361e-06, + "loss": 4.7677, + "step": 16210 + }, + { + "epoch": 0.896351575456053, + "grad_norm": 3.798962354660034, + "learning_rate": 5.329732802728824e-06, + "loss": 5.0452, + "step": 16215 + }, + { + "epoch": 0.896627971254837, + "grad_norm": 3.9125924110412598, + "learning_rate": 5.315520181921547e-06, + "loss": 4.8263, + "step": 16220 + }, + { + "epoch": 0.8969043670536208, + "grad_norm": 5.244776248931885, + "learning_rate": 5.301307561114269e-06, + "loss": 5.1466, + "step": 16225 + }, + { + "epoch": 0.8971807628524047, + "grad_norm": 4.3974528312683105, + "learning_rate": 5.287094940306993e-06, + "loss": 4.6972, + "step": 16230 + }, + { + "epoch": 0.8974571586511885, + "grad_norm": 3.663856267929077, + "learning_rate": 5.272882319499716e-06, + "loss": 4.8124, + "step": 16235 + }, + { + "epoch": 0.8977335544499724, + "grad_norm": 3.177274465560913, + "learning_rate": 5.258669698692439e-06, + "loss": 4.7413, + "step": 16240 + }, + { + "epoch": 0.8980099502487562, + "grad_norm": 4.527139663696289, + "learning_rate": 5.244457077885162e-06, + "loss": 4.6418, + "step": 16245 + }, + { + "epoch": 0.8982863460475401, + "grad_norm": 3.90124773979187, + "learning_rate": 5.230244457077886e-06, + "loss": 4.985, + "step": 16250 + }, + { + "epoch": 0.8985627418463239, + "grad_norm": 4.100987911224365, + "learning_rate": 5.216031836270609e-06, + "loss": 4.9227, + "step": 16255 + }, + { + "epoch": 0.8988391376451078, + "grad_norm": 3.7627832889556885, + "learning_rate": 5.201819215463331e-06, + "loss": 4.658, + "step": 16260 + }, + { + "epoch": 0.8991155334438916, + "grad_norm": 3.8064067363739014, + "learning_rate": 5.187606594656055e-06, + "loss": 4.7242, + "step": 16265 + }, + { + "epoch": 0.8993919292426755, + "grad_norm": 4.380611419677734, + "learning_rate": 5.173393973848778e-06, + "loss": 4.957, + "step": 16270 + }, + { + "epoch": 0.8996683250414593, + "grad_norm": 4.4961676597595215, + "learning_rate": 5.159181353041501e-06, + "loss": 4.7186, + "step": 16275 + }, + { + "epoch": 0.8999447208402432, + "grad_norm": 4.06073522567749, + "learning_rate": 5.144968732234224e-06, + "loss": 5.024, + "step": 16280 + }, + { + "epoch": 0.900221116639027, + "grad_norm": 3.396531581878662, + "learning_rate": 5.130756111426948e-06, + "loss": 4.8516, + "step": 16285 + }, + { + "epoch": 0.900497512437811, + "grad_norm": 3.7098519802093506, + "learning_rate": 5.116543490619671e-06, + "loss": 4.719, + "step": 16290 + }, + { + "epoch": 0.9007739082365948, + "grad_norm": 3.9446892738342285, + "learning_rate": 5.102330869812394e-06, + "loss": 5.1406, + "step": 16295 + }, + { + "epoch": 0.9010503040353787, + "grad_norm": 3.210261344909668, + "learning_rate": 5.088118249005117e-06, + "loss": 4.6729, + "step": 16300 + }, + { + "epoch": 0.9013266998341625, + "grad_norm": 4.007756233215332, + "learning_rate": 5.07390562819784e-06, + "loss": 4.7955, + "step": 16305 + }, + { + "epoch": 0.9016030956329464, + "grad_norm": 3.227644205093384, + "learning_rate": 5.059693007390563e-06, + "loss": 4.7183, + "step": 16310 + }, + { + "epoch": 0.9018794914317302, + "grad_norm": 4.634718894958496, + "learning_rate": 5.0454803865832855e-06, + "loss": 5.101, + "step": 16315 + }, + { + "epoch": 0.9021558872305141, + "grad_norm": 4.174224853515625, + "learning_rate": 5.0312677657760095e-06, + "loss": 4.9855, + "step": 16320 + }, + { + "epoch": 0.9024322830292979, + "grad_norm": 3.7757952213287354, + "learning_rate": 5.017055144968733e-06, + "loss": 5.0235, + "step": 16325 + }, + { + "epoch": 0.9027086788280818, + "grad_norm": 3.770765781402588, + "learning_rate": 5.002842524161456e-06, + "loss": 5.1194, + "step": 16330 + }, + { + "epoch": 0.9029850746268657, + "grad_norm": 3.6249911785125732, + "learning_rate": 4.988629903354178e-06, + "loss": 4.7802, + "step": 16335 + }, + { + "epoch": 0.9032614704256495, + "grad_norm": 4.837128162384033, + "learning_rate": 4.974417282546902e-06, + "loss": 4.599, + "step": 16340 + }, + { + "epoch": 0.9035378662244334, + "grad_norm": 4.374870777130127, + "learning_rate": 4.960204661739625e-06, + "loss": 4.8379, + "step": 16345 + }, + { + "epoch": 0.9038142620232172, + "grad_norm": 4.0973734855651855, + "learning_rate": 4.945992040932348e-06, + "loss": 5.0881, + "step": 16350 + }, + { + "epoch": 0.9040906578220012, + "grad_norm": 4.1243977546691895, + "learning_rate": 4.9317794201250714e-06, + "loss": 5.1151, + "step": 16355 + }, + { + "epoch": 0.904367053620785, + "grad_norm": 3.1973705291748047, + "learning_rate": 4.9175667993177946e-06, + "loss": 5.0274, + "step": 16360 + }, + { + "epoch": 0.9046434494195689, + "grad_norm": 3.458981990814209, + "learning_rate": 4.903354178510518e-06, + "loss": 4.8251, + "step": 16365 + }, + { + "epoch": 0.9049198452183527, + "grad_norm": 3.640015125274658, + "learning_rate": 4.88914155770324e-06, + "loss": 4.9673, + "step": 16370 + }, + { + "epoch": 0.9051962410171366, + "grad_norm": 3.2531189918518066, + "learning_rate": 4.874928936895964e-06, + "loss": 4.5497, + "step": 16375 + }, + { + "epoch": 0.9054726368159204, + "grad_norm": 3.353008508682251, + "learning_rate": 4.860716316088687e-06, + "loss": 4.8228, + "step": 16380 + }, + { + "epoch": 0.9057490326147043, + "grad_norm": 4.130444049835205, + "learning_rate": 4.84650369528141e-06, + "loss": 4.7594, + "step": 16385 + }, + { + "epoch": 0.9060254284134881, + "grad_norm": 3.1232221126556396, + "learning_rate": 4.832291074474133e-06, + "loss": 4.9349, + "step": 16390 + }, + { + "epoch": 0.906301824212272, + "grad_norm": 3.7138428688049316, + "learning_rate": 4.8180784536668565e-06, + "loss": 4.7187, + "step": 16395 + }, + { + "epoch": 0.9065782200110558, + "grad_norm": 3.310006618499756, + "learning_rate": 4.80386583285958e-06, + "loss": 5.0194, + "step": 16400 + }, + { + "epoch": 0.9068546158098397, + "grad_norm": 5.019920349121094, + "learning_rate": 4.789653212052303e-06, + "loss": 4.9197, + "step": 16405 + }, + { + "epoch": 0.9071310116086235, + "grad_norm": 3.792968273162842, + "learning_rate": 4.775440591245026e-06, + "loss": 4.9581, + "step": 16410 + }, + { + "epoch": 0.9074074074074074, + "grad_norm": 4.522578716278076, + "learning_rate": 4.761227970437749e-06, + "loss": 4.7076, + "step": 16415 + }, + { + "epoch": 0.9076838032061912, + "grad_norm": 3.9118542671203613, + "learning_rate": 4.747015349630472e-06, + "loss": 4.9707, + "step": 16420 + }, + { + "epoch": 0.9079601990049752, + "grad_norm": 4.168303489685059, + "learning_rate": 4.732802728823195e-06, + "loss": 4.603, + "step": 16425 + }, + { + "epoch": 0.908236594803759, + "grad_norm": 3.641812324523926, + "learning_rate": 4.7185901080159184e-06, + "loss": 4.8405, + "step": 16430 + }, + { + "epoch": 0.9085129906025429, + "grad_norm": 4.432767868041992, + "learning_rate": 4.7043774872086416e-06, + "loss": 5.1405, + "step": 16435 + }, + { + "epoch": 0.9087893864013267, + "grad_norm": 3.7158989906311035, + "learning_rate": 4.690164866401365e-06, + "loss": 4.8914, + "step": 16440 + }, + { + "epoch": 0.9090657822001106, + "grad_norm": 3.8096296787261963, + "learning_rate": 4.675952245594088e-06, + "loss": 4.8564, + "step": 16445 + }, + { + "epoch": 0.9093421779988944, + "grad_norm": 3.3491950035095215, + "learning_rate": 4.661739624786811e-06, + "loss": 5.1232, + "step": 16450 + }, + { + "epoch": 0.9096185737976783, + "grad_norm": 4.407762050628662, + "learning_rate": 4.647527003979534e-06, + "loss": 5.2513, + "step": 16455 + }, + { + "epoch": 0.9098949695964621, + "grad_norm": 4.356717109680176, + "learning_rate": 4.633314383172257e-06, + "loss": 5.0331, + "step": 16460 + }, + { + "epoch": 0.910171365395246, + "grad_norm": 3.2973270416259766, + "learning_rate": 4.61910176236498e-06, + "loss": 4.8999, + "step": 16465 + }, + { + "epoch": 0.9104477611940298, + "grad_norm": 3.944094181060791, + "learning_rate": 4.6048891415577035e-06, + "loss": 4.7392, + "step": 16470 + }, + { + "epoch": 0.9107241569928137, + "grad_norm": 3.541595220565796, + "learning_rate": 4.590676520750427e-06, + "loss": 5.0021, + "step": 16475 + }, + { + "epoch": 0.9110005527915975, + "grad_norm": 4.237240314483643, + "learning_rate": 4.57646389994315e-06, + "loss": 4.586, + "step": 16480 + }, + { + "epoch": 0.9112769485903814, + "grad_norm": 3.664316177368164, + "learning_rate": 4.562251279135873e-06, + "loss": 4.9774, + "step": 16485 + }, + { + "epoch": 0.9115533443891652, + "grad_norm": 3.5280942916870117, + "learning_rate": 4.548038658328596e-06, + "loss": 4.9678, + "step": 16490 + }, + { + "epoch": 0.9118297401879492, + "grad_norm": 3.860072135925293, + "learning_rate": 4.533826037521319e-06, + "loss": 5.1321, + "step": 16495 + }, + { + "epoch": 0.912106135986733, + "grad_norm": 3.7712130546569824, + "learning_rate": 4.519613416714042e-06, + "loss": 4.9678, + "step": 16500 + }, + { + "epoch": 0.9123825317855169, + "grad_norm": 3.747433662414551, + "learning_rate": 4.505400795906765e-06, + "loss": 4.9893, + "step": 16505 + }, + { + "epoch": 0.9126589275843007, + "grad_norm": 6.0445170402526855, + "learning_rate": 4.4911881750994885e-06, + "loss": 4.8376, + "step": 16510 + }, + { + "epoch": 0.9129353233830846, + "grad_norm": 3.5901873111724854, + "learning_rate": 4.476975554292212e-06, + "loss": 4.8362, + "step": 16515 + }, + { + "epoch": 0.9132117191818684, + "grad_norm": 3.6722826957702637, + "learning_rate": 4.462762933484935e-06, + "loss": 5.1034, + "step": 16520 + }, + { + "epoch": 0.9134881149806523, + "grad_norm": 3.5389833450317383, + "learning_rate": 4.448550312677658e-06, + "loss": 4.6817, + "step": 16525 + }, + { + "epoch": 0.9137645107794361, + "grad_norm": 3.544811248779297, + "learning_rate": 4.434337691870381e-06, + "loss": 5.0695, + "step": 16530 + }, + { + "epoch": 0.91404090657822, + "grad_norm": 4.132248401641846, + "learning_rate": 4.420125071063104e-06, + "loss": 4.9197, + "step": 16535 + }, + { + "epoch": 0.9143173023770038, + "grad_norm": 3.680356979370117, + "learning_rate": 4.405912450255827e-06, + "loss": 4.5352, + "step": 16540 + }, + { + "epoch": 0.9145936981757877, + "grad_norm": 4.035909175872803, + "learning_rate": 4.3916998294485505e-06, + "loss": 4.8975, + "step": 16545 + }, + { + "epoch": 0.9148700939745716, + "grad_norm": 3.8951597213745117, + "learning_rate": 4.377487208641274e-06, + "loss": 4.817, + "step": 16550 + }, + { + "epoch": 0.9151464897733554, + "grad_norm": 3.9107933044433594, + "learning_rate": 4.363274587833997e-06, + "loss": 4.7651, + "step": 16555 + }, + { + "epoch": 0.9154228855721394, + "grad_norm": 3.4493396282196045, + "learning_rate": 4.34906196702672e-06, + "loss": 5.076, + "step": 16560 + }, + { + "epoch": 0.9156992813709232, + "grad_norm": 4.296689987182617, + "learning_rate": 4.334849346219443e-06, + "loss": 4.9039, + "step": 16565 + }, + { + "epoch": 0.9159756771697071, + "grad_norm": 3.8431396484375, + "learning_rate": 4.320636725412166e-06, + "loss": 4.6727, + "step": 16570 + }, + { + "epoch": 0.9162520729684909, + "grad_norm": 4.500297546386719, + "learning_rate": 4.306424104604889e-06, + "loss": 5.3912, + "step": 16575 + }, + { + "epoch": 0.9165284687672748, + "grad_norm": 4.726571083068848, + "learning_rate": 4.292211483797612e-06, + "loss": 5.2984, + "step": 16580 + }, + { + "epoch": 0.9168048645660586, + "grad_norm": 3.416921615600586, + "learning_rate": 4.2779988629903355e-06, + "loss": 4.6175, + "step": 16585 + }, + { + "epoch": 0.9170812603648425, + "grad_norm": 3.1644046306610107, + "learning_rate": 4.263786242183059e-06, + "loss": 4.6776, + "step": 16590 + }, + { + "epoch": 0.9173576561636263, + "grad_norm": 3.82157826423645, + "learning_rate": 4.249573621375782e-06, + "loss": 4.881, + "step": 16595 + }, + { + "epoch": 0.9176340519624102, + "grad_norm": 3.8752880096435547, + "learning_rate": 4.235361000568505e-06, + "loss": 4.749, + "step": 16600 + }, + { + "epoch": 0.917910447761194, + "grad_norm": 3.842712879180908, + "learning_rate": 4.221148379761229e-06, + "loss": 4.7253, + "step": 16605 + }, + { + "epoch": 0.9181868435599779, + "grad_norm": 4.872412204742432, + "learning_rate": 4.206935758953951e-06, + "loss": 4.7417, + "step": 16610 + }, + { + "epoch": 0.9184632393587617, + "grad_norm": 3.414172649383545, + "learning_rate": 4.192723138146674e-06, + "loss": 5.1289, + "step": 16615 + }, + { + "epoch": 0.9187396351575456, + "grad_norm": 4.3799543380737305, + "learning_rate": 4.1785105173393974e-06, + "loss": 4.9575, + "step": 16620 + }, + { + "epoch": 0.9190160309563294, + "grad_norm": 3.8672125339508057, + "learning_rate": 4.164297896532121e-06, + "loss": 4.7991, + "step": 16625 + }, + { + "epoch": 0.9192924267551134, + "grad_norm": 3.75191593170166, + "learning_rate": 4.150085275724844e-06, + "loss": 4.8514, + "step": 16630 + }, + { + "epoch": 0.9195688225538972, + "grad_norm": 3.8761277198791504, + "learning_rate": 4.135872654917567e-06, + "loss": 4.9027, + "step": 16635 + }, + { + "epoch": 0.9198452183526811, + "grad_norm": 3.8116533756256104, + "learning_rate": 4.12166003411029e-06, + "loss": 4.7569, + "step": 16640 + }, + { + "epoch": 0.9201216141514649, + "grad_norm": 3.882634162902832, + "learning_rate": 4.107447413303013e-06, + "loss": 4.6937, + "step": 16645 + }, + { + "epoch": 0.9203980099502488, + "grad_norm": 3.3166961669921875, + "learning_rate": 4.093234792495736e-06, + "loss": 4.8617, + "step": 16650 + }, + { + "epoch": 0.9206744057490326, + "grad_norm": 4.038979530334473, + "learning_rate": 4.079022171688459e-06, + "loss": 4.9206, + "step": 16655 + }, + { + "epoch": 0.9209508015478165, + "grad_norm": 3.362423896789551, + "learning_rate": 4.064809550881183e-06, + "loss": 4.9875, + "step": 16660 + }, + { + "epoch": 0.9212271973466003, + "grad_norm": 3.3738491535186768, + "learning_rate": 4.050596930073906e-06, + "loss": 4.7933, + "step": 16665 + }, + { + "epoch": 0.9215035931453842, + "grad_norm": 3.503601551055908, + "learning_rate": 4.036384309266629e-06, + "loss": 5.0732, + "step": 16670 + }, + { + "epoch": 0.921779988944168, + "grad_norm": 3.1450142860412598, + "learning_rate": 4.022171688459352e-06, + "loss": 4.7811, + "step": 16675 + }, + { + "epoch": 0.9220563847429519, + "grad_norm": 3.4202752113342285, + "learning_rate": 4.007959067652075e-06, + "loss": 5.0072, + "step": 16680 + }, + { + "epoch": 0.9223327805417357, + "grad_norm": 3.7823784351348877, + "learning_rate": 3.993746446844798e-06, + "loss": 5.1254, + "step": 16685 + }, + { + "epoch": 0.9226091763405196, + "grad_norm": 4.299469947814941, + "learning_rate": 3.979533826037521e-06, + "loss": 4.9729, + "step": 16690 + }, + { + "epoch": 0.9228855721393034, + "grad_norm": 3.722261905670166, + "learning_rate": 3.965321205230245e-06, + "loss": 4.9265, + "step": 16695 + }, + { + "epoch": 0.9231619679380874, + "grad_norm": 4.488846778869629, + "learning_rate": 3.9511085844229676e-06, + "loss": 4.8332, + "step": 16700 + }, + { + "epoch": 0.9234383637368712, + "grad_norm": 3.4476404190063477, + "learning_rate": 3.936895963615691e-06, + "loss": 4.8358, + "step": 16705 + }, + { + "epoch": 0.9237147595356551, + "grad_norm": 3.9329166412353516, + "learning_rate": 3.922683342808414e-06, + "loss": 4.721, + "step": 16710 + }, + { + "epoch": 0.9239911553344389, + "grad_norm": 4.05313777923584, + "learning_rate": 3.908470722001138e-06, + "loss": 4.9849, + "step": 16715 + }, + { + "epoch": 0.9242675511332228, + "grad_norm": 3.66392183303833, + "learning_rate": 3.89425810119386e-06, + "loss": 4.8486, + "step": 16720 + }, + { + "epoch": 0.9245439469320066, + "grad_norm": 4.335777759552002, + "learning_rate": 3.880045480386583e-06, + "loss": 4.9046, + "step": 16725 + }, + { + "epoch": 0.9248203427307905, + "grad_norm": 4.8382954597473145, + "learning_rate": 3.865832859579307e-06, + "loss": 4.8476, + "step": 16730 + }, + { + "epoch": 0.9250967385295743, + "grad_norm": 3.354020357131958, + "learning_rate": 3.8516202387720295e-06, + "loss": 5.12, + "step": 16735 + }, + { + "epoch": 0.9253731343283582, + "grad_norm": 3.5212149620056152, + "learning_rate": 3.837407617964753e-06, + "loss": 4.6406, + "step": 16740 + }, + { + "epoch": 0.925649530127142, + "grad_norm": 4.447071552276611, + "learning_rate": 3.823194997157476e-06, + "loss": 5.1303, + "step": 16745 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 4.714621067047119, + "learning_rate": 3.8089823763501993e-06, + "loss": 5.0047, + "step": 16750 + }, + { + "epoch": 0.9262023217247097, + "grad_norm": 4.3438568115234375, + "learning_rate": 3.794769755542922e-06, + "loss": 4.8138, + "step": 16755 + }, + { + "epoch": 0.9264787175234936, + "grad_norm": 3.487215280532837, + "learning_rate": 3.7805571347356456e-06, + "loss": 5.0629, + "step": 16760 + }, + { + "epoch": 0.9267551133222774, + "grad_norm": 3.3207781314849854, + "learning_rate": 3.7663445139283687e-06, + "loss": 4.6624, + "step": 16765 + }, + { + "epoch": 0.9270315091210614, + "grad_norm": 3.6874585151672363, + "learning_rate": 3.7521318931210914e-06, + "loss": 4.9265, + "step": 16770 + }, + { + "epoch": 0.9273079049198453, + "grad_norm": 3.417487859725952, + "learning_rate": 3.7379192723138145e-06, + "loss": 4.8298, + "step": 16775 + }, + { + "epoch": 0.9275843007186291, + "grad_norm": 3.985056161880493, + "learning_rate": 3.723706651506538e-06, + "loss": 5.0497, + "step": 16780 + }, + { + "epoch": 0.927860696517413, + "grad_norm": 3.5535130500793457, + "learning_rate": 3.7094940306992612e-06, + "loss": 4.9327, + "step": 16785 + }, + { + "epoch": 0.9281370923161968, + "grad_norm": 3.7775490283966064, + "learning_rate": 3.695281409891984e-06, + "loss": 4.7599, + "step": 16790 + }, + { + "epoch": 0.9284134881149807, + "grad_norm": 3.6912879943847656, + "learning_rate": 3.6810687890847075e-06, + "loss": 4.73, + "step": 16795 + }, + { + "epoch": 0.9286898839137645, + "grad_norm": 3.8959012031555176, + "learning_rate": 3.6668561682774306e-06, + "loss": 4.6683, + "step": 16800 + }, + { + "epoch": 0.9289662797125484, + "grad_norm": 3.763211250305176, + "learning_rate": 3.6526435474701538e-06, + "loss": 4.9404, + "step": 16805 + }, + { + "epoch": 0.9292426755113322, + "grad_norm": 2.960988998413086, + "learning_rate": 3.6384309266628765e-06, + "loss": 4.7549, + "step": 16810 + }, + { + "epoch": 0.9295190713101161, + "grad_norm": 3.7784411907196045, + "learning_rate": 3.6242183058556e-06, + "loss": 5.0343, + "step": 16815 + }, + { + "epoch": 0.9297954671088999, + "grad_norm": 3.5923070907592773, + "learning_rate": 3.610005685048323e-06, + "loss": 4.9945, + "step": 16820 + }, + { + "epoch": 0.9300718629076838, + "grad_norm": 4.040002822875977, + "learning_rate": 3.595793064241046e-06, + "loss": 4.9812, + "step": 16825 + }, + { + "epoch": 0.9303482587064676, + "grad_norm": 4.1851806640625, + "learning_rate": 3.5815804434337694e-06, + "loss": 5.0198, + "step": 16830 + }, + { + "epoch": 0.9306246545052516, + "grad_norm": 3.2891855239868164, + "learning_rate": 3.5673678226264926e-06, + "loss": 4.9279, + "step": 16835 + }, + { + "epoch": 0.9309010503040354, + "grad_norm": 4.148435115814209, + "learning_rate": 3.5531552018192157e-06, + "loss": 4.7171, + "step": 16840 + }, + { + "epoch": 0.9311774461028193, + "grad_norm": 3.529670238494873, + "learning_rate": 3.5389425810119384e-06, + "loss": 5.2854, + "step": 16845 + }, + { + "epoch": 0.9314538419016031, + "grad_norm": 4.003045082092285, + "learning_rate": 3.524729960204662e-06, + "loss": 4.829, + "step": 16850 + }, + { + "epoch": 0.931730237700387, + "grad_norm": 4.24057674407959, + "learning_rate": 3.510517339397385e-06, + "loss": 4.6103, + "step": 16855 + }, + { + "epoch": 0.9320066334991708, + "grad_norm": 4.488296985626221, + "learning_rate": 3.4963047185901086e-06, + "loss": 4.9311, + "step": 16860 + }, + { + "epoch": 0.9322830292979547, + "grad_norm": 3.4120020866394043, + "learning_rate": 3.482092097782831e-06, + "loss": 4.8596, + "step": 16865 + }, + { + "epoch": 0.9325594250967385, + "grad_norm": 3.683335781097412, + "learning_rate": 3.4678794769755545e-06, + "loss": 4.5523, + "step": 16870 + }, + { + "epoch": 0.9328358208955224, + "grad_norm": 3.4997408390045166, + "learning_rate": 3.4536668561682776e-06, + "loss": 4.8484, + "step": 16875 + }, + { + "epoch": 0.9331122166943062, + "grad_norm": 4.172585964202881, + "learning_rate": 3.4394542353610003e-06, + "loss": 4.8826, + "step": 16880 + }, + { + "epoch": 0.9333886124930901, + "grad_norm": 3.2831268310546875, + "learning_rate": 3.425241614553724e-06, + "loss": 4.8996, + "step": 16885 + }, + { + "epoch": 0.9336650082918739, + "grad_norm": 4.524475574493408, + "learning_rate": 3.411028993746447e-06, + "loss": 5.0182, + "step": 16890 + }, + { + "epoch": 0.9339414040906578, + "grad_norm": 3.8012876510620117, + "learning_rate": 3.3968163729391706e-06, + "loss": 4.9137, + "step": 16895 + }, + { + "epoch": 0.9342177998894416, + "grad_norm": 4.124208450317383, + "learning_rate": 3.382603752131893e-06, + "loss": 4.7937, + "step": 16900 + }, + { + "epoch": 0.9344941956882256, + "grad_norm": 3.331831932067871, + "learning_rate": 3.3683911313246164e-06, + "loss": 4.7215, + "step": 16905 + }, + { + "epoch": 0.9347705914870094, + "grad_norm": 4.77909517288208, + "learning_rate": 3.3541785105173395e-06, + "loss": 4.811, + "step": 16910 + }, + { + "epoch": 0.9350469872857933, + "grad_norm": 4.471525192260742, + "learning_rate": 3.339965889710063e-06, + "loss": 4.704, + "step": 16915 + }, + { + "epoch": 0.9353233830845771, + "grad_norm": 3.200888156890869, + "learning_rate": 3.325753268902786e-06, + "loss": 4.5741, + "step": 16920 + }, + { + "epoch": 0.935599778883361, + "grad_norm": 3.5812439918518066, + "learning_rate": 3.311540648095509e-06, + "loss": 4.9324, + "step": 16925 + }, + { + "epoch": 0.9358761746821448, + "grad_norm": 4.601278305053711, + "learning_rate": 3.2973280272882325e-06, + "loss": 4.908, + "step": 16930 + }, + { + "epoch": 0.9361525704809287, + "grad_norm": 3.4622044563293457, + "learning_rate": 3.2831154064809548e-06, + "loss": 4.5155, + "step": 16935 + }, + { + "epoch": 0.9364289662797125, + "grad_norm": 4.049286842346191, + "learning_rate": 3.2689027856736783e-06, + "loss": 4.7219, + "step": 16940 + }, + { + "epoch": 0.9367053620784964, + "grad_norm": 4.182099342346191, + "learning_rate": 3.2546901648664015e-06, + "loss": 4.8228, + "step": 16945 + }, + { + "epoch": 0.9369817578772802, + "grad_norm": 3.8937876224517822, + "learning_rate": 3.240477544059125e-06, + "loss": 4.8272, + "step": 16950 + }, + { + "epoch": 0.9372581536760641, + "grad_norm": 4.102065086364746, + "learning_rate": 3.2262649232518477e-06, + "loss": 5.0221, + "step": 16955 + }, + { + "epoch": 0.9375345494748479, + "grad_norm": 3.8191025257110596, + "learning_rate": 3.212052302444571e-06, + "loss": 4.9027, + "step": 16960 + }, + { + "epoch": 0.9378109452736318, + "grad_norm": 3.0223190784454346, + "learning_rate": 3.1978396816372944e-06, + "loss": 4.7848, + "step": 16965 + }, + { + "epoch": 0.9380873410724156, + "grad_norm": 3.0020852088928223, + "learning_rate": 3.1836270608300175e-06, + "loss": 4.8542, + "step": 16970 + }, + { + "epoch": 0.9383637368711996, + "grad_norm": 3.872197389602661, + "learning_rate": 3.1694144400227403e-06, + "loss": 4.4044, + "step": 16975 + }, + { + "epoch": 0.9386401326699834, + "grad_norm": 4.418979167938232, + "learning_rate": 3.1552018192154634e-06, + "loss": 4.732, + "step": 16980 + }, + { + "epoch": 0.9389165284687673, + "grad_norm": 3.6312499046325684, + "learning_rate": 3.140989198408187e-06, + "loss": 5.0736, + "step": 16985 + }, + { + "epoch": 0.9391929242675512, + "grad_norm": 4.047171592712402, + "learning_rate": 3.1267765776009097e-06, + "loss": 4.8834, + "step": 16990 + }, + { + "epoch": 0.939469320066335, + "grad_norm": 3.8443045616149902, + "learning_rate": 3.1125639567936328e-06, + "loss": 5.0899, + "step": 16995 + }, + { + "epoch": 0.9397457158651189, + "grad_norm": 3.2416129112243652, + "learning_rate": 3.0983513359863563e-06, + "loss": 4.6055, + "step": 17000 + }, + { + "epoch": 0.9400221116639027, + "grad_norm": 4.050750255584717, + "learning_rate": 3.084138715179079e-06, + "loss": 4.6478, + "step": 17005 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 3.5441677570343018, + "learning_rate": 3.0699260943718026e-06, + "loss": 4.7012, + "step": 17010 + }, + { + "epoch": 0.9405749032614704, + "grad_norm": 3.7427573204040527, + "learning_rate": 3.0557134735645253e-06, + "loss": 4.8786, + "step": 17015 + }, + { + "epoch": 0.9408512990602543, + "grad_norm": 3.412027359008789, + "learning_rate": 3.041500852757249e-06, + "loss": 5.0255, + "step": 17020 + }, + { + "epoch": 0.9411276948590381, + "grad_norm": 4.9815497398376465, + "learning_rate": 3.0272882319499716e-06, + "loss": 4.6702, + "step": 17025 + }, + { + "epoch": 0.941404090657822, + "grad_norm": 4.5840044021606445, + "learning_rate": 3.0130756111426947e-06, + "loss": 4.7252, + "step": 17030 + }, + { + "epoch": 0.9416804864566058, + "grad_norm": 3.795989751815796, + "learning_rate": 2.998862990335418e-06, + "loss": 5.215, + "step": 17035 + }, + { + "epoch": 0.9419568822553898, + "grad_norm": 3.6358675956726074, + "learning_rate": 2.984650369528141e-06, + "loss": 4.6967, + "step": 17040 + }, + { + "epoch": 0.9422332780541736, + "grad_norm": 3.8705015182495117, + "learning_rate": 2.9704377487208645e-06, + "loss": 4.806, + "step": 17045 + }, + { + "epoch": 0.9425096738529575, + "grad_norm": 3.705080270767212, + "learning_rate": 2.9562251279135872e-06, + "loss": 4.7858, + "step": 17050 + }, + { + "epoch": 0.9427860696517413, + "grad_norm": 3.397587299346924, + "learning_rate": 2.942012507106311e-06, + "loss": 4.4487, + "step": 17055 + }, + { + "epoch": 0.9430624654505252, + "grad_norm": 4.504406929016113, + "learning_rate": 2.9277998862990335e-06, + "loss": 5.0349, + "step": 17060 + }, + { + "epoch": 0.943338861249309, + "grad_norm": 4.826185703277588, + "learning_rate": 2.913587265491757e-06, + "loss": 5.0815, + "step": 17065 + }, + { + "epoch": 0.9436152570480929, + "grad_norm": 3.7354743480682373, + "learning_rate": 2.8993746446844798e-06, + "loss": 4.5416, + "step": 17070 + }, + { + "epoch": 0.9438916528468767, + "grad_norm": 3.69808030128479, + "learning_rate": 2.885162023877203e-06, + "loss": 5.134, + "step": 17075 + }, + { + "epoch": 0.9441680486456606, + "grad_norm": 3.6593971252441406, + "learning_rate": 2.8709494030699265e-06, + "loss": 4.9477, + "step": 17080 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 3.8395798206329346, + "learning_rate": 2.856736782262649e-06, + "loss": 4.4886, + "step": 17085 + }, + { + "epoch": 0.9447208402432283, + "grad_norm": 4.302114009857178, + "learning_rate": 2.8425241614553727e-06, + "loss": 4.9254, + "step": 17090 + }, + { + "epoch": 0.9449972360420121, + "grad_norm": 3.074225664138794, + "learning_rate": 2.8283115406480954e-06, + "loss": 4.586, + "step": 17095 + }, + { + "epoch": 0.945273631840796, + "grad_norm": 4.352227687835693, + "learning_rate": 2.814098919840819e-06, + "loss": 4.9557, + "step": 17100 + }, + { + "epoch": 0.9455500276395798, + "grad_norm": 4.241294860839844, + "learning_rate": 2.7998862990335417e-06, + "loss": 5.0972, + "step": 17105 + }, + { + "epoch": 0.9458264234383638, + "grad_norm": 3.86039662361145, + "learning_rate": 2.7856736782262652e-06, + "loss": 4.9244, + "step": 17110 + }, + { + "epoch": 0.9461028192371476, + "grad_norm": 3.4481401443481445, + "learning_rate": 2.771461057418988e-06, + "loss": 4.6124, + "step": 17115 + }, + { + "epoch": 0.9463792150359315, + "grad_norm": 3.4227750301361084, + "learning_rate": 2.7572484366117115e-06, + "loss": 4.7664, + "step": 17120 + }, + { + "epoch": 0.9466556108347153, + "grad_norm": 4.316078186035156, + "learning_rate": 2.7430358158044346e-06, + "loss": 4.8977, + "step": 17125 + }, + { + "epoch": 0.9469320066334992, + "grad_norm": 4.136010646820068, + "learning_rate": 2.7288231949971574e-06, + "loss": 5.169, + "step": 17130 + }, + { + "epoch": 0.947208402432283, + "grad_norm": 3.5034444332122803, + "learning_rate": 2.714610574189881e-06, + "loss": 4.9702, + "step": 17135 + }, + { + "epoch": 0.9474847982310669, + "grad_norm": 3.8330748081207275, + "learning_rate": 2.7003979533826036e-06, + "loss": 5.0539, + "step": 17140 + }, + { + "epoch": 0.9477611940298507, + "grad_norm": 3.778444290161133, + "learning_rate": 2.686185332575327e-06, + "loss": 4.9856, + "step": 17145 + }, + { + "epoch": 0.9480375898286346, + "grad_norm": 4.075733184814453, + "learning_rate": 2.67197271176805e-06, + "loss": 4.4412, + "step": 17150 + }, + { + "epoch": 0.9483139856274184, + "grad_norm": 4.561306953430176, + "learning_rate": 2.6577600909607734e-06, + "loss": 4.5683, + "step": 17155 + }, + { + "epoch": 0.9485903814262023, + "grad_norm": 4.139663219451904, + "learning_rate": 2.6435474701534966e-06, + "loss": 5.1347, + "step": 17160 + }, + { + "epoch": 0.9488667772249861, + "grad_norm": 4.152304172515869, + "learning_rate": 2.6293348493462197e-06, + "loss": 4.851, + "step": 17165 + }, + { + "epoch": 0.94914317302377, + "grad_norm": 3.967447280883789, + "learning_rate": 2.615122228538943e-06, + "loss": 4.9876, + "step": 17170 + }, + { + "epoch": 0.9494195688225538, + "grad_norm": 3.313063144683838, + "learning_rate": 2.6009096077316655e-06, + "loss": 4.8804, + "step": 17175 + }, + { + "epoch": 0.9496959646213378, + "grad_norm": 3.359733819961548, + "learning_rate": 2.586696986924389e-06, + "loss": 4.673, + "step": 17180 + }, + { + "epoch": 0.9499723604201216, + "grad_norm": 3.648066520690918, + "learning_rate": 2.572484366117112e-06, + "loss": 4.9545, + "step": 17185 + }, + { + "epoch": 0.9502487562189055, + "grad_norm": 3.5169296264648438, + "learning_rate": 2.5582717453098354e-06, + "loss": 4.7484, + "step": 17190 + }, + { + "epoch": 0.9505251520176893, + "grad_norm": 3.443011522293091, + "learning_rate": 2.5440591245025585e-06, + "loss": 4.6217, + "step": 17195 + }, + { + "epoch": 0.9508015478164732, + "grad_norm": 3.1068191528320312, + "learning_rate": 2.5298465036952816e-06, + "loss": 4.6914, + "step": 17200 + }, + { + "epoch": 0.951077943615257, + "grad_norm": 3.6582255363464355, + "learning_rate": 2.5156338828880048e-06, + "loss": 4.7874, + "step": 17205 + }, + { + "epoch": 0.9513543394140409, + "grad_norm": 4.528609752655029, + "learning_rate": 2.501421262080728e-06, + "loss": 4.8206, + "step": 17210 + }, + { + "epoch": 0.9516307352128248, + "grad_norm": 4.048614025115967, + "learning_rate": 2.487208641273451e-06, + "loss": 5.1836, + "step": 17215 + }, + { + "epoch": 0.9519071310116086, + "grad_norm": 3.5472965240478516, + "learning_rate": 2.472996020466174e-06, + "loss": 4.8558, + "step": 17220 + }, + { + "epoch": 0.9521835268103925, + "grad_norm": 3.3577089309692383, + "learning_rate": 2.4587833996588973e-06, + "loss": 5.1221, + "step": 17225 + }, + { + "epoch": 0.9524599226091763, + "grad_norm": 3.5389177799224854, + "learning_rate": 2.44457077885162e-06, + "loss": 5.0266, + "step": 17230 + }, + { + "epoch": 0.9527363184079602, + "grad_norm": 4.607699871063232, + "learning_rate": 2.4303581580443436e-06, + "loss": 5.0358, + "step": 17235 + }, + { + "epoch": 0.953012714206744, + "grad_norm": 3.5849125385284424, + "learning_rate": 2.4161455372370667e-06, + "loss": 4.8641, + "step": 17240 + }, + { + "epoch": 0.953289110005528, + "grad_norm": 4.045363426208496, + "learning_rate": 2.40193291642979e-06, + "loss": 4.5956, + "step": 17245 + }, + { + "epoch": 0.9535655058043118, + "grad_norm": 3.181807041168213, + "learning_rate": 2.387720295622513e-06, + "loss": 4.868, + "step": 17250 + }, + { + "epoch": 0.9538419016030957, + "grad_norm": 3.7292158603668213, + "learning_rate": 2.373507674815236e-06, + "loss": 4.9381, + "step": 17255 + }, + { + "epoch": 0.9541182974018795, + "grad_norm": 3.4840378761291504, + "learning_rate": 2.3592950540079592e-06, + "loss": 4.6714, + "step": 17260 + }, + { + "epoch": 0.9543946932006634, + "grad_norm": 4.143215179443359, + "learning_rate": 2.3450824332006823e-06, + "loss": 4.5491, + "step": 17265 + }, + { + "epoch": 0.9546710889994472, + "grad_norm": 4.7000555992126465, + "learning_rate": 2.3308698123934055e-06, + "loss": 4.7206, + "step": 17270 + }, + { + "epoch": 0.9549474847982311, + "grad_norm": 3.5755834579467773, + "learning_rate": 2.3166571915861286e-06, + "loss": 4.6387, + "step": 17275 + }, + { + "epoch": 0.9552238805970149, + "grad_norm": 3.0749599933624268, + "learning_rate": 2.3024445707788517e-06, + "loss": 4.6379, + "step": 17280 + }, + { + "epoch": 0.9555002763957988, + "grad_norm": 4.241413593292236, + "learning_rate": 2.288231949971575e-06, + "loss": 4.7813, + "step": 17285 + }, + { + "epoch": 0.9557766721945826, + "grad_norm": 4.150501251220703, + "learning_rate": 2.274019329164298e-06, + "loss": 5.048, + "step": 17290 + }, + { + "epoch": 0.9560530679933665, + "grad_norm": 3.893385887145996, + "learning_rate": 2.259806708357021e-06, + "loss": 5.1972, + "step": 17295 + }, + { + "epoch": 0.9563294637921503, + "grad_norm": 3.0259926319122314, + "learning_rate": 2.2455940875497443e-06, + "loss": 4.9307, + "step": 17300 + }, + { + "epoch": 0.9566058595909342, + "grad_norm": 4.032433986663818, + "learning_rate": 2.2313814667424674e-06, + "loss": 4.8233, + "step": 17305 + }, + { + "epoch": 0.956882255389718, + "grad_norm": 3.581953525543213, + "learning_rate": 2.2171688459351905e-06, + "loss": 4.994, + "step": 17310 + }, + { + "epoch": 0.957158651188502, + "grad_norm": 3.022449493408203, + "learning_rate": 2.2029562251279137e-06, + "loss": 4.8858, + "step": 17315 + }, + { + "epoch": 0.9574350469872858, + "grad_norm": 3.664137125015259, + "learning_rate": 2.188743604320637e-06, + "loss": 4.8783, + "step": 17320 + }, + { + "epoch": 0.9577114427860697, + "grad_norm": 3.227354049682617, + "learning_rate": 2.17453098351336e-06, + "loss": 4.923, + "step": 17325 + }, + { + "epoch": 0.9579878385848535, + "grad_norm": 3.1704261302948, + "learning_rate": 2.160318362706083e-06, + "loss": 4.8075, + "step": 17330 + }, + { + "epoch": 0.9582642343836374, + "grad_norm": 3.537228584289551, + "learning_rate": 2.146105741898806e-06, + "loss": 4.7424, + "step": 17335 + }, + { + "epoch": 0.9585406301824212, + "grad_norm": 4.334897518157959, + "learning_rate": 2.1318931210915293e-06, + "loss": 4.9451, + "step": 17340 + }, + { + "epoch": 0.9588170259812051, + "grad_norm": 3.4837393760681152, + "learning_rate": 2.1176805002842525e-06, + "loss": 4.8, + "step": 17345 + }, + { + "epoch": 0.9590934217799889, + "grad_norm": 3.615074396133423, + "learning_rate": 2.1034678794769756e-06, + "loss": 4.9119, + "step": 17350 + }, + { + "epoch": 0.9593698175787728, + "grad_norm": 3.716219425201416, + "learning_rate": 2.0892552586696987e-06, + "loss": 5.1262, + "step": 17355 + }, + { + "epoch": 0.9596462133775566, + "grad_norm": 3.8585875034332275, + "learning_rate": 2.075042637862422e-06, + "loss": 4.9943, + "step": 17360 + }, + { + "epoch": 0.9599226091763405, + "grad_norm": 3.1968586444854736, + "learning_rate": 2.060830017055145e-06, + "loss": 4.8486, + "step": 17365 + }, + { + "epoch": 0.9601990049751243, + "grad_norm": 5.157547950744629, + "learning_rate": 2.046617396247868e-06, + "loss": 5.1108, + "step": 17370 + }, + { + "epoch": 0.9604754007739082, + "grad_norm": 4.623779296875, + "learning_rate": 2.0324047754405917e-06, + "loss": 4.89, + "step": 17375 + }, + { + "epoch": 0.960751796572692, + "grad_norm": 3.5606627464294434, + "learning_rate": 2.0181921546333144e-06, + "loss": 4.8832, + "step": 17380 + }, + { + "epoch": 0.961028192371476, + "grad_norm": 3.792771577835083, + "learning_rate": 2.0039795338260375e-06, + "loss": 4.8013, + "step": 17385 + }, + { + "epoch": 0.9613045881702598, + "grad_norm": 3.4608917236328125, + "learning_rate": 1.9897669130187606e-06, + "loss": 4.888, + "step": 17390 + }, + { + "epoch": 0.9615809839690437, + "grad_norm": 4.744699001312256, + "learning_rate": 1.9755542922114838e-06, + "loss": 5.3272, + "step": 17395 + }, + { + "epoch": 0.9618573797678275, + "grad_norm": 3.0279312133789062, + "learning_rate": 1.961341671404207e-06, + "loss": 4.6455, + "step": 17400 + }, + { + "epoch": 0.9621337755666114, + "grad_norm": 3.420703649520874, + "learning_rate": 1.94712905059693e-06, + "loss": 4.8824, + "step": 17405 + }, + { + "epoch": 0.9624101713653952, + "grad_norm": 3.330737590789795, + "learning_rate": 1.9329164297896536e-06, + "loss": 4.7754, + "step": 17410 + }, + { + "epoch": 0.9626865671641791, + "grad_norm": 3.5434346199035645, + "learning_rate": 1.9187038089823763e-06, + "loss": 4.6785, + "step": 17415 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 4.19118595123291, + "learning_rate": 1.9044911881750997e-06, + "loss": 4.8247, + "step": 17420 + }, + { + "epoch": 0.9632393587617468, + "grad_norm": 4.7213454246521, + "learning_rate": 1.8902785673678228e-06, + "loss": 4.8312, + "step": 17425 + }, + { + "epoch": 0.9635157545605307, + "grad_norm": 3.6587038040161133, + "learning_rate": 1.8760659465605457e-06, + "loss": 5.1918, + "step": 17430 + }, + { + "epoch": 0.9637921503593145, + "grad_norm": 3.242537021636963, + "learning_rate": 1.861853325753269e-06, + "loss": 4.9542, + "step": 17435 + }, + { + "epoch": 0.9640685461580984, + "grad_norm": 3.7462520599365234, + "learning_rate": 1.847640704945992e-06, + "loss": 4.9511, + "step": 17440 + }, + { + "epoch": 0.9643449419568823, + "grad_norm": 3.6050407886505127, + "learning_rate": 1.8334280841387153e-06, + "loss": 4.8826, + "step": 17445 + }, + { + "epoch": 0.9646213377556662, + "grad_norm": 3.979299783706665, + "learning_rate": 1.8192154633314382e-06, + "loss": 4.879, + "step": 17450 + }, + { + "epoch": 0.96489773355445, + "grad_norm": 4.422558784484863, + "learning_rate": 1.8050028425241616e-06, + "loss": 5.2462, + "step": 17455 + }, + { + "epoch": 0.9651741293532339, + "grad_norm": 3.984792947769165, + "learning_rate": 1.7907902217168847e-06, + "loss": 4.9413, + "step": 17460 + }, + { + "epoch": 0.9654505251520177, + "grad_norm": 3.3358352184295654, + "learning_rate": 1.7765776009096078e-06, + "loss": 4.8726, + "step": 17465 + }, + { + "epoch": 0.9657269209508016, + "grad_norm": 3.0157341957092285, + "learning_rate": 1.762364980102331e-06, + "loss": 4.9832, + "step": 17470 + }, + { + "epoch": 0.9660033167495854, + "grad_norm": 3.5615592002868652, + "learning_rate": 1.7481523592950543e-06, + "loss": 4.6077, + "step": 17475 + }, + { + "epoch": 0.9662797125483693, + "grad_norm": 3.1383845806121826, + "learning_rate": 1.7339397384877772e-06, + "loss": 4.7842, + "step": 17480 + }, + { + "epoch": 0.9665561083471531, + "grad_norm": 4.033356189727783, + "learning_rate": 1.7197271176805002e-06, + "loss": 4.9063, + "step": 17485 + }, + { + "epoch": 0.966832504145937, + "grad_norm": 3.651676654815674, + "learning_rate": 1.7055144968732235e-06, + "loss": 4.7036, + "step": 17490 + }, + { + "epoch": 0.9671088999447208, + "grad_norm": 3.519984245300293, + "learning_rate": 1.6913018760659464e-06, + "loss": 4.9829, + "step": 17495 + }, + { + "epoch": 0.9673852957435047, + "grad_norm": 2.9878344535827637, + "learning_rate": 1.6770892552586698e-06, + "loss": 4.6443, + "step": 17500 + }, + { + "epoch": 0.9676616915422885, + "grad_norm": 3.7152388095855713, + "learning_rate": 1.662876634451393e-06, + "loss": 4.8157, + "step": 17505 + }, + { + "epoch": 0.9679380873410725, + "grad_norm": 3.1234991550445557, + "learning_rate": 1.6486640136441162e-06, + "loss": 5.0816, + "step": 17510 + }, + { + "epoch": 0.9682144831398563, + "grad_norm": 3.695985794067383, + "learning_rate": 1.6344513928368392e-06, + "loss": 5.1458, + "step": 17515 + }, + { + "epoch": 0.9684908789386402, + "grad_norm": 3.487590789794922, + "learning_rate": 1.6202387720295625e-06, + "loss": 4.5166, + "step": 17520 + }, + { + "epoch": 0.968767274737424, + "grad_norm": 3.8516764640808105, + "learning_rate": 1.6060261512222854e-06, + "loss": 5.0152, + "step": 17525 + }, + { + "epoch": 0.9690436705362079, + "grad_norm": 3.8771095275878906, + "learning_rate": 1.5918135304150088e-06, + "loss": 4.7823, + "step": 17530 + }, + { + "epoch": 0.9693200663349917, + "grad_norm": 4.3737287521362305, + "learning_rate": 1.5776009096077317e-06, + "loss": 4.905, + "step": 17535 + }, + { + "epoch": 0.9695964621337756, + "grad_norm": 4.190290451049805, + "learning_rate": 1.5633882888004548e-06, + "loss": 4.7303, + "step": 17540 + }, + { + "epoch": 0.9698728579325594, + "grad_norm": 4.203909397125244, + "learning_rate": 1.5491756679931782e-06, + "loss": 4.6031, + "step": 17545 + }, + { + "epoch": 0.9701492537313433, + "grad_norm": 3.6529483795166016, + "learning_rate": 1.5349630471859013e-06, + "loss": 4.9744, + "step": 17550 + }, + { + "epoch": 0.9704256495301271, + "grad_norm": 3.739197015762329, + "learning_rate": 1.5207504263786244e-06, + "loss": 4.8951, + "step": 17555 + }, + { + "epoch": 0.970702045328911, + "grad_norm": 4.100447177886963, + "learning_rate": 1.5065378055713474e-06, + "loss": 5.0774, + "step": 17560 + }, + { + "epoch": 0.9709784411276948, + "grad_norm": 4.293310165405273, + "learning_rate": 1.4923251847640705e-06, + "loss": 4.7931, + "step": 17565 + }, + { + "epoch": 0.9712548369264787, + "grad_norm": 3.8114206790924072, + "learning_rate": 1.4781125639567936e-06, + "loss": 4.6042, + "step": 17570 + }, + { + "epoch": 0.9715312327252625, + "grad_norm": 3.7172446250915527, + "learning_rate": 1.4638999431495168e-06, + "loss": 4.7856, + "step": 17575 + }, + { + "epoch": 0.9718076285240465, + "grad_norm": 3.316227912902832, + "learning_rate": 1.4496873223422399e-06, + "loss": 4.8669, + "step": 17580 + }, + { + "epoch": 0.9720840243228303, + "grad_norm": 4.058900833129883, + "learning_rate": 1.4354747015349632e-06, + "loss": 4.5917, + "step": 17585 + }, + { + "epoch": 0.9723604201216142, + "grad_norm": 3.633906841278076, + "learning_rate": 1.4212620807276864e-06, + "loss": 4.7329, + "step": 17590 + }, + { + "epoch": 0.972636815920398, + "grad_norm": 4.460212230682373, + "learning_rate": 1.4070494599204095e-06, + "loss": 4.7991, + "step": 17595 + }, + { + "epoch": 0.9729132117191819, + "grad_norm": 3.0964694023132324, + "learning_rate": 1.3928368391131326e-06, + "loss": 4.6162, + "step": 17600 + }, + { + "epoch": 0.9731896075179657, + "grad_norm": 3.333630323410034, + "learning_rate": 1.3786242183058558e-06, + "loss": 4.5838, + "step": 17605 + }, + { + "epoch": 0.9734660033167496, + "grad_norm": 3.469555139541626, + "learning_rate": 1.3644115974985787e-06, + "loss": 5.2188, + "step": 17610 + }, + { + "epoch": 0.9737423991155334, + "grad_norm": 3.8169009685516357, + "learning_rate": 1.3501989766913018e-06, + "loss": 5.018, + "step": 17615 + }, + { + "epoch": 0.9740187949143173, + "grad_norm": 4.118598461151123, + "learning_rate": 1.335986355884025e-06, + "loss": 5.0628, + "step": 17620 + }, + { + "epoch": 0.9742951907131011, + "grad_norm": 3.506039619445801, + "learning_rate": 1.3217737350767483e-06, + "loss": 4.6609, + "step": 17625 + }, + { + "epoch": 0.974571586511885, + "grad_norm": 3.59202241897583, + "learning_rate": 1.3075611142694714e-06, + "loss": 4.9948, + "step": 17630 + }, + { + "epoch": 0.9748479823106688, + "grad_norm": 3.3904547691345215, + "learning_rate": 1.2933484934621945e-06, + "loss": 5.0171, + "step": 17635 + }, + { + "epoch": 0.9751243781094527, + "grad_norm": 3.5725204944610596, + "learning_rate": 1.2791358726549177e-06, + "loss": 4.719, + "step": 17640 + }, + { + "epoch": 0.9754007739082365, + "grad_norm": 4.067680835723877, + "learning_rate": 1.2649232518476408e-06, + "loss": 5.1472, + "step": 17645 + }, + { + "epoch": 0.9756771697070205, + "grad_norm": 3.5419161319732666, + "learning_rate": 1.250710631040364e-06, + "loss": 4.9008, + "step": 17650 + }, + { + "epoch": 0.9759535655058044, + "grad_norm": 4.060971260070801, + "learning_rate": 1.236498010233087e-06, + "loss": 4.8635, + "step": 17655 + }, + { + "epoch": 0.9762299613045882, + "grad_norm": 3.4809699058532715, + "learning_rate": 1.22228538942581e-06, + "loss": 4.7176, + "step": 17660 + }, + { + "epoch": 0.9765063571033721, + "grad_norm": 3.121370792388916, + "learning_rate": 1.2080727686185333e-06, + "loss": 4.7872, + "step": 17665 + }, + { + "epoch": 0.9767827529021559, + "grad_norm": 3.8386788368225098, + "learning_rate": 1.1938601478112565e-06, + "loss": 4.6917, + "step": 17670 + }, + { + "epoch": 0.9770591487009398, + "grad_norm": 3.4671096801757812, + "learning_rate": 1.1796475270039796e-06, + "loss": 5.0222, + "step": 17675 + }, + { + "epoch": 0.9773355444997236, + "grad_norm": 4.064565658569336, + "learning_rate": 1.1654349061967027e-06, + "loss": 4.9298, + "step": 17680 + }, + { + "epoch": 0.9776119402985075, + "grad_norm": 3.155139923095703, + "learning_rate": 1.1512222853894259e-06, + "loss": 4.7341, + "step": 17685 + }, + { + "epoch": 0.9778883360972913, + "grad_norm": 3.358921527862549, + "learning_rate": 1.137009664582149e-06, + "loss": 4.8701, + "step": 17690 + }, + { + "epoch": 0.9781647318960752, + "grad_norm": 3.9727041721343994, + "learning_rate": 1.1227970437748721e-06, + "loss": 4.9399, + "step": 17695 + }, + { + "epoch": 0.978441127694859, + "grad_norm": 3.551064968109131, + "learning_rate": 1.1085844229675953e-06, + "loss": 4.4855, + "step": 17700 + }, + { + "epoch": 0.9787175234936429, + "grad_norm": 3.285154104232788, + "learning_rate": 1.0943718021603184e-06, + "loss": 4.6904, + "step": 17705 + }, + { + "epoch": 0.9789939192924267, + "grad_norm": 3.77357816696167, + "learning_rate": 1.0801591813530415e-06, + "loss": 4.7048, + "step": 17710 + }, + { + "epoch": 0.9792703150912107, + "grad_norm": 3.7652862071990967, + "learning_rate": 1.0659465605457647e-06, + "loss": 4.8469, + "step": 17715 + }, + { + "epoch": 0.9795467108899945, + "grad_norm": 4.785614013671875, + "learning_rate": 1.0517339397384878e-06, + "loss": 4.6331, + "step": 17720 + }, + { + "epoch": 0.9798231066887784, + "grad_norm": 3.4421780109405518, + "learning_rate": 1.037521318931211e-06, + "loss": 4.8027, + "step": 17725 + }, + { + "epoch": 0.9800995024875622, + "grad_norm": 3.414224863052368, + "learning_rate": 1.023308698123934e-06, + "loss": 4.6746, + "step": 17730 + }, + { + "epoch": 0.9803758982863461, + "grad_norm": 3.6963014602661133, + "learning_rate": 1.0090960773166572e-06, + "loss": 4.9759, + "step": 17735 + }, + { + "epoch": 0.9806522940851299, + "grad_norm": 4.396233558654785, + "learning_rate": 9.948834565093803e-07, + "loss": 4.9611, + "step": 17740 + }, + { + "epoch": 0.9809286898839138, + "grad_norm": 3.8200385570526123, + "learning_rate": 9.806708357021035e-07, + "loss": 4.7393, + "step": 17745 + }, + { + "epoch": 0.9812050856826976, + "grad_norm": 3.469111680984497, + "learning_rate": 9.664582148948268e-07, + "loss": 4.9948, + "step": 17750 + }, + { + "epoch": 0.9814814814814815, + "grad_norm": 3.5124690532684326, + "learning_rate": 9.522455940875498e-07, + "loss": 4.9789, + "step": 17755 + }, + { + "epoch": 0.9817578772802653, + "grad_norm": 3.9192700386047363, + "learning_rate": 9.380329732802729e-07, + "loss": 4.7527, + "step": 17760 + }, + { + "epoch": 0.9820342730790492, + "grad_norm": 3.603703260421753, + "learning_rate": 9.23820352472996e-07, + "loss": 4.8427, + "step": 17765 + }, + { + "epoch": 0.982310668877833, + "grad_norm": 3.82257342338562, + "learning_rate": 9.096077316657191e-07, + "loss": 4.6548, + "step": 17770 + }, + { + "epoch": 0.9825870646766169, + "grad_norm": 3.6828839778900146, + "learning_rate": 8.953951108584424e-07, + "loss": 4.7383, + "step": 17775 + }, + { + "epoch": 0.9828634604754007, + "grad_norm": 4.09072732925415, + "learning_rate": 8.811824900511655e-07, + "loss": 4.6382, + "step": 17780 + }, + { + "epoch": 0.9831398562741847, + "grad_norm": 3.557931900024414, + "learning_rate": 8.669698692438886e-07, + "loss": 4.7966, + "step": 17785 + }, + { + "epoch": 0.9834162520729685, + "grad_norm": 3.4907984733581543, + "learning_rate": 8.527572484366118e-07, + "loss": 5.0257, + "step": 17790 + }, + { + "epoch": 0.9836926478717524, + "grad_norm": 4.357024192810059, + "learning_rate": 8.385446276293349e-07, + "loss": 5.2176, + "step": 17795 + }, + { + "epoch": 0.9839690436705362, + "grad_norm": 4.168322563171387, + "learning_rate": 8.243320068220581e-07, + "loss": 4.9219, + "step": 17800 + }, + { + "epoch": 0.9842454394693201, + "grad_norm": 4.186100006103516, + "learning_rate": 8.101193860147813e-07, + "loss": 5.1336, + "step": 17805 + }, + { + "epoch": 0.9845218352681039, + "grad_norm": 4.123225212097168, + "learning_rate": 7.959067652075044e-07, + "loss": 4.7761, + "step": 17810 + }, + { + "epoch": 0.9847982310668878, + "grad_norm": 4.013276100158691, + "learning_rate": 7.816941444002274e-07, + "loss": 4.8168, + "step": 17815 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 3.063643217086792, + "learning_rate": 7.674815235929507e-07, + "loss": 4.8048, + "step": 17820 + }, + { + "epoch": 0.9853510226644555, + "grad_norm": 4.011346817016602, + "learning_rate": 7.532689027856737e-07, + "loss": 4.3031, + "step": 17825 + }, + { + "epoch": 0.9856274184632393, + "grad_norm": 4.323795318603516, + "learning_rate": 7.390562819783968e-07, + "loss": 4.7946, + "step": 17830 + }, + { + "epoch": 0.9859038142620232, + "grad_norm": 4.777589321136475, + "learning_rate": 7.248436611711199e-07, + "loss": 4.6589, + "step": 17835 + }, + { + "epoch": 0.986180210060807, + "grad_norm": 4.321505546569824, + "learning_rate": 7.106310403638432e-07, + "loss": 4.9267, + "step": 17840 + }, + { + "epoch": 0.9864566058595909, + "grad_norm": 3.9251480102539062, + "learning_rate": 6.964184195565663e-07, + "loss": 4.6038, + "step": 17845 + }, + { + "epoch": 0.9867330016583747, + "grad_norm": 3.0840792655944824, + "learning_rate": 6.822057987492893e-07, + "loss": 4.5904, + "step": 17850 + }, + { + "epoch": 0.9870093974571587, + "grad_norm": 4.160499572753906, + "learning_rate": 6.679931779420125e-07, + "loss": 4.8037, + "step": 17855 + }, + { + "epoch": 0.9872857932559425, + "grad_norm": 4.162237167358398, + "learning_rate": 6.537805571347357e-07, + "loss": 5.0783, + "step": 17860 + }, + { + "epoch": 0.9875621890547264, + "grad_norm": 3.218111515045166, + "learning_rate": 6.395679363274588e-07, + "loss": 4.8852, + "step": 17865 + }, + { + "epoch": 0.9878385848535103, + "grad_norm": 3.7877068519592285, + "learning_rate": 6.25355315520182e-07, + "loss": 5.0094, + "step": 17870 + }, + { + "epoch": 0.9881149806522941, + "grad_norm": 3.7480013370513916, + "learning_rate": 6.11142694712905e-07, + "loss": 4.9167, + "step": 17875 + }, + { + "epoch": 0.988391376451078, + "grad_norm": 3.9518656730651855, + "learning_rate": 5.969300739056282e-07, + "loss": 4.9065, + "step": 17880 + }, + { + "epoch": 0.9886677722498618, + "grad_norm": 4.303936958312988, + "learning_rate": 5.827174530983514e-07, + "loss": 5.0365, + "step": 17885 + }, + { + "epoch": 0.9889441680486457, + "grad_norm": 3.944624423980713, + "learning_rate": 5.685048322910745e-07, + "loss": 4.9906, + "step": 17890 + }, + { + "epoch": 0.9892205638474295, + "grad_norm": 3.4610772132873535, + "learning_rate": 5.542922114837976e-07, + "loss": 4.6898, + "step": 17895 + }, + { + "epoch": 0.9894969596462134, + "grad_norm": 3.5662105083465576, + "learning_rate": 5.400795906765208e-07, + "loss": 5.1878, + "step": 17900 + }, + { + "epoch": 0.9897733554449972, + "grad_norm": 3.8425076007843018, + "learning_rate": 5.258669698692439e-07, + "loss": 4.482, + "step": 17905 + }, + { + "epoch": 0.9900497512437811, + "grad_norm": 3.6111109256744385, + "learning_rate": 5.11654349061967e-07, + "loss": 4.8749, + "step": 17910 + }, + { + "epoch": 0.9903261470425649, + "grad_norm": 3.664844274520874, + "learning_rate": 4.974417282546902e-07, + "loss": 5.1134, + "step": 17915 + }, + { + "epoch": 0.9906025428413489, + "grad_norm": 3.341689109802246, + "learning_rate": 4.832291074474134e-07, + "loss": 5.1036, + "step": 17920 + }, + { + "epoch": 0.9908789386401327, + "grad_norm": 3.0677828788757324, + "learning_rate": 4.6901648664013643e-07, + "loss": 5.0625, + "step": 17925 + }, + { + "epoch": 0.9911553344389166, + "grad_norm": 4.164408206939697, + "learning_rate": 4.5480386583285956e-07, + "loss": 4.8023, + "step": 17930 + }, + { + "epoch": 0.9914317302377004, + "grad_norm": 3.8659586906433105, + "learning_rate": 4.4059124502558274e-07, + "loss": 4.9043, + "step": 17935 + }, + { + "epoch": 0.9917081260364843, + "grad_norm": 4.336862087249756, + "learning_rate": 4.263786242183059e-07, + "loss": 5.1321, + "step": 17940 + }, + { + "epoch": 0.9919845218352681, + "grad_norm": 3.780893325805664, + "learning_rate": 4.1216600341102906e-07, + "loss": 5.1098, + "step": 17945 + }, + { + "epoch": 0.992260917634052, + "grad_norm": 3.7630019187927246, + "learning_rate": 3.979533826037522e-07, + "loss": 4.7298, + "step": 17950 + }, + { + "epoch": 0.9925373134328358, + "grad_norm": 4.459870338439941, + "learning_rate": 3.837407617964753e-07, + "loss": 5.0754, + "step": 17955 + }, + { + "epoch": 0.9928137092316197, + "grad_norm": 3.592924118041992, + "learning_rate": 3.695281409891984e-07, + "loss": 4.9391, + "step": 17960 + }, + { + "epoch": 0.9930901050304035, + "grad_norm": 3.273549795150757, + "learning_rate": 3.553155201819216e-07, + "loss": 4.6995, + "step": 17965 + }, + { + "epoch": 0.9933665008291874, + "grad_norm": 3.976041555404663, + "learning_rate": 3.4110289937464467e-07, + "loss": 5.0032, + "step": 17970 + }, + { + "epoch": 0.9936428966279712, + "grad_norm": 3.7148942947387695, + "learning_rate": 3.2689027856736785e-07, + "loss": 4.9053, + "step": 17975 + }, + { + "epoch": 0.9939192924267551, + "grad_norm": 4.372527122497559, + "learning_rate": 3.12677657760091e-07, + "loss": 4.6127, + "step": 17980 + }, + { + "epoch": 0.9941956882255389, + "grad_norm": 4.019134521484375, + "learning_rate": 2.984650369528141e-07, + "loss": 4.9816, + "step": 17985 + }, + { + "epoch": 0.9944720840243229, + "grad_norm": 4.684123992919922, + "learning_rate": 2.8425241614553725e-07, + "loss": 4.9317, + "step": 17990 + }, + { + "epoch": 0.9947484798231067, + "grad_norm": 4.50031852722168, + "learning_rate": 2.700397953382604e-07, + "loss": 4.8163, + "step": 17995 + }, + { + "epoch": 0.9950248756218906, + "grad_norm": 3.7687060832977295, + "learning_rate": 2.558271745309835e-07, + "loss": 4.8997, + "step": 18000 + }, + { + "epoch": 0.9953012714206744, + "grad_norm": 5.449435234069824, + "learning_rate": 2.416145537237067e-07, + "loss": 4.8225, + "step": 18005 + }, + { + "epoch": 0.9955776672194583, + "grad_norm": 3.540816068649292, + "learning_rate": 2.2740193291642978e-07, + "loss": 4.8916, + "step": 18010 + }, + { + "epoch": 0.9958540630182421, + "grad_norm": 3.5293331146240234, + "learning_rate": 2.1318931210915294e-07, + "loss": 4.8202, + "step": 18015 + }, + { + "epoch": 0.996130458817026, + "grad_norm": 4.026026248931885, + "learning_rate": 1.989766913018761e-07, + "loss": 5.0024, + "step": 18020 + }, + { + "epoch": 0.9964068546158098, + "grad_norm": 4.405704021453857, + "learning_rate": 1.847640704945992e-07, + "loss": 4.9342, + "step": 18025 + }, + { + "epoch": 0.9966832504145937, + "grad_norm": 3.3129539489746094, + "learning_rate": 1.7055144968732233e-07, + "loss": 4.8739, + "step": 18030 + }, + { + "epoch": 0.9969596462133775, + "grad_norm": 4.042524814605713, + "learning_rate": 1.563388288800455e-07, + "loss": 4.9343, + "step": 18035 + }, + { + "epoch": 0.9972360420121614, + "grad_norm": 3.105717658996582, + "learning_rate": 1.4212620807276863e-07, + "loss": 4.463, + "step": 18040 + }, + { + "epoch": 0.9975124378109452, + "grad_norm": 3.765864133834839, + "learning_rate": 1.2791358726549176e-07, + "loss": 4.9118, + "step": 18045 + }, + { + "epoch": 0.9977888336097291, + "grad_norm": 4.456231594085693, + "learning_rate": 1.1370096645821489e-07, + "loss": 4.8341, + "step": 18050 + }, + { + "epoch": 0.9980652294085129, + "grad_norm": 3.701779365539551, + "learning_rate": 9.948834565093805e-08, + "loss": 4.9091, + "step": 18055 + }, + { + "epoch": 0.9983416252072969, + "grad_norm": 3.981372833251953, + "learning_rate": 8.527572484366117e-08, + "loss": 4.9035, + "step": 18060 + }, + { + "epoch": 0.9986180210060807, + "grad_norm": 4.189070224761963, + "learning_rate": 7.106310403638431e-08, + "loss": 4.8013, + "step": 18065 + }, + { + "epoch": 0.9988944168048646, + "grad_norm": 3.8687007427215576, + "learning_rate": 5.6850483229107445e-08, + "loss": 4.7042, + "step": 18070 + }, + { + "epoch": 0.9991708126036484, + "grad_norm": 4.483328342437744, + "learning_rate": 4.2637862421830584e-08, + "loss": 4.6875, + "step": 18075 + }, + { + "epoch": 0.9994472084024323, + "grad_norm": 4.180428981781006, + "learning_rate": 2.8425241614553722e-08, + "loss": 4.816, + "step": 18080 + }, + { + "epoch": 0.9997236042012161, + "grad_norm": 3.824141025543213, + "learning_rate": 1.4212620807276861e-08, + "loss": 4.8663, + "step": 18085 + }, + { + "epoch": 1.0, + "grad_norm": 3.4549672603607178, + "learning_rate": 0.0, + "loss": 4.3516, + "step": 18090 } ], "logging_steps": 5, @@ -16821,12 +25347,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 1.1151619342811136e+16, + "total_flos": 1.6815556451321856e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null