diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5518 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5, + "eval_steps": 250, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001, + "grad_norm": 0.000537872314453125, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.0002, + "loss/crossentropy": 0.8766392022371292, + "loss/hidden": 0.0, + "loss/logits": 0.00021765431665698998, + "step": 1 + }, + { + "epoch": 0.002, + "grad_norm": 0.2265625, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.005, + "loss/crossentropy": 1.9883175492286682, + "loss/hidden": 0.0039215087890625, + "loss/logits": 0.001088879187591374, + "step": 2 + }, + { + "epoch": 0.003, + "grad_norm": 0.25390625, + "learning_rate": 6.000000000000001e-07, + "loss": 0.0052, + "loss/crossentropy": 1.8020615577697754, + "loss/hidden": 0.004180908203125, + "loss/logits": 0.0010398300073575228, + "step": 3 + }, + { + "epoch": 0.004, + "grad_norm": 0.255859375, + "learning_rate": 8.000000000000001e-07, + "loss": 0.0049, + "loss/crossentropy": 1.0764193534851074, + "loss/hidden": 0.00399017333984375, + "loss/logits": 0.0008995172393042594, + "step": 4 + }, + { + "epoch": 0.005, + "grad_norm": 0.224609375, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0049, + "loss/crossentropy": 1.7853868007659912, + "loss/hidden": 0.0038604736328125, + "loss/logits": 0.0010730837238952518, + "step": 5 + }, + { + "epoch": 0.006, + "grad_norm": 0.2333984375, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0051, + "loss/crossentropy": 2.4102118015289307, + "loss/hidden": 0.00388336181640625, + "loss/logits": 0.0011915687937289476, + "step": 6 + }, + { + "epoch": 0.007, + "grad_norm": 0.35546875, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.0056, + "loss/crossentropy": 1.9921993017196655, + "loss/hidden": 0.0044403076171875, + "loss/logits": 0.0011139529524371028, + "step": 7 + }, + { + "epoch": 0.008, + "grad_norm": 0.2353515625, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0049, + "loss/crossentropy": 2.269957184791565, + "loss/hidden": 0.00376129150390625, + "loss/logits": 0.0011444001575000584, + "step": 8 + }, + { + "epoch": 0.009, + "grad_norm": 0.22265625, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.0051, + "loss/crossentropy": 2.1889681220054626, + "loss/hidden": 0.0038909912109375, + "loss/logits": 0.0011716101435013115, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 0.291015625, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0052, + "loss/crossentropy": 1.76205712556839, + "loss/hidden": 0.0041351318359375, + "loss/logits": 0.001058999594533816, + "step": 10 + }, + { + "epoch": 0.011, + "grad_norm": 0.2177734375, + "learning_rate": 2.2e-06, + "loss": 0.0049, + "loss/crossentropy": 2.438264012336731, + "loss/hidden": 0.003753662109375, + "loss/logits": 0.0011843050015158951, + "step": 11 + }, + { + "epoch": 0.012, + "grad_norm": 0.41015625, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.0071, + "loss/crossentropy": 1.8871825337409973, + "loss/hidden": 0.0059051513671875, + "loss/logits": 0.0011930759064853191, + "step": 12 + }, + { + "epoch": 0.013, + "grad_norm": 0.53125, + "learning_rate": 2.6e-06, + "loss": 0.0084, + "loss/crossentropy": 1.7400972247123718, + "loss/hidden": 0.0071258544921875, + "loss/logits": 0.001270102453418076, + "step": 13 + }, + { + "epoch": 0.014, + "grad_norm": 0.365234375, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.0075, + "loss/crossentropy": 2.0053656101226807, + "loss/hidden": 0.006256103515625, + "loss/logits": 0.0012446122709661722, + "step": 14 + }, + { + "epoch": 0.015, + "grad_norm": 0.455078125, + "learning_rate": 3e-06, + "loss": 0.0072, + "loss/crossentropy": 1.984630048274994, + "loss/hidden": 0.0059356689453125, + "loss/logits": 0.0012947238283231854, + "step": 15 + }, + { + "epoch": 0.016, + "grad_norm": 0.447265625, + "grad_norm_var": 0.016307008621940136, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.0072, + "loss/crossentropy": 2.4732788801193237, + "loss/hidden": 0.005767822265625, + "loss/logits": 0.00144299550447613, + "step": 16 + }, + { + "epoch": 0.017, + "grad_norm": 0.89453125, + "grad_norm_var": 0.031113270918528238, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0076, + "loss/crossentropy": 1.7775737643241882, + "loss/hidden": 0.006317138671875, + "loss/logits": 0.001260987774003297, + "step": 17 + }, + { + "epoch": 0.018, + "grad_norm": 0.45703125, + "grad_norm_var": 0.030601243178049724, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.0067, + "loss/crossentropy": 1.1123631671071053, + "loss/hidden": 0.0057373046875, + "loss/logits": 0.0009507400100119412, + "step": 18 + }, + { + "epoch": 0.019, + "grad_norm": 0.298828125, + "grad_norm_var": 0.030057998498280843, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0068, + "loss/crossentropy": 1.8855515718460083, + "loss/hidden": 0.0055694580078125, + "loss/logits": 0.0012491169618442655, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 0.3984375, + "grad_norm_var": 0.02918777068456014, + "learning_rate": 4.000000000000001e-06, + "loss": 0.007, + "loss/crossentropy": 1.773246705532074, + "loss/hidden": 0.005828857421875, + "loss/logits": 0.0011664124322123826, + "step": 20 + }, + { + "epoch": 0.021, + "grad_norm": 0.302734375, + "grad_norm_var": 0.02797787586847941, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0069, + "loss/crossentropy": 2.1012651920318604, + "loss/hidden": 0.0056610107421875, + "loss/logits": 0.0012796117807738483, + "step": 21 + }, + { + "epoch": 0.022, + "grad_norm": 0.486328125, + "grad_norm_var": 0.026955906550089517, + "learning_rate": 4.4e-06, + "loss": 0.0101, + "loss/crossentropy": 1.9430513381958008, + "loss/hidden": 0.008514404296875, + "loss/logits": 0.0016175230266526341, + "step": 22 + }, + { + "epoch": 0.023, + "grad_norm": 0.609375, + "grad_norm_var": 0.029542907079060873, + "learning_rate": 4.600000000000001e-06, + "loss": 0.0118, + "loss/crossentropy": 1.5989271998405457, + "loss/hidden": 0.01025390625, + "loss/logits": 0.0015109491650946438, + "step": 23 + }, + { + "epoch": 0.024, + "grad_norm": 0.80078125, + "grad_norm_var": 0.03606090148289998, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0102, + "loss/crossentropy": 1.141058474779129, + "loss/hidden": 0.009033203125, + "loss/logits": 0.0011210083321202546, + "step": 24 + }, + { + "epoch": 0.025, + "grad_norm": 0.361328125, + "grad_norm_var": 0.03307259480158488, + "learning_rate": 5e-06, + "loss": 0.0094, + "loss/crossentropy": 2.0950170755386353, + "loss/hidden": 0.0077972412109375, + "loss/logits": 0.001559894997626543, + "step": 25 + }, + { + "epoch": 0.026, + "grad_norm": 0.83984375, + "grad_norm_var": 0.0396828293800354, + "learning_rate": 5.2e-06, + "loss": 0.0112, + "loss/crossentropy": 0.9552253857254982, + "loss/hidden": 0.010284423828125, + "loss/logits": 0.0008805262332316488, + "step": 26 + }, + { + "epoch": 0.027, + "grad_norm": 0.546875, + "grad_norm_var": 0.034408044815063474, + "learning_rate": 5.400000000000001e-06, + "loss": 0.0091, + "loss/crossentropy": 1.3719437271356583, + "loss/hidden": 0.007965087890625, + "loss/logits": 0.001155910431407392, + "step": 27 + }, + { + "epoch": 0.028, + "grad_norm": 0.73046875, + "grad_norm_var": 0.036436065038045244, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0107, + "loss/crossentropy": 1.6477643251419067, + "loss/hidden": 0.009185791015625, + "loss/logits": 0.0015593590214848518, + "step": 28 + }, + { + "epoch": 0.029, + "grad_norm": 0.41796875, + "grad_norm_var": 0.03726207415262858, + "learning_rate": 5.8e-06, + "loss": 0.0096, + "loss/crossentropy": 1.7987680435180664, + "loss/hidden": 0.008087158203125, + "loss/logits": 0.0015162223717197776, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 0.33203125, + "grad_norm_var": 0.03804162343343099, + "learning_rate": 6e-06, + "loss": 0.0094, + "loss/crossentropy": 1.74210923910141, + "loss/hidden": 0.008026123046875, + "loss/logits": 0.0013514517340809107, + "step": 30 + }, + { + "epoch": 0.031, + "grad_norm": 0.4296875, + "grad_norm_var": 0.038314167658487955, + "learning_rate": 6.200000000000001e-06, + "loss": 0.0095, + "loss/crossentropy": 1.45715793967247, + "loss/hidden": 0.0081329345703125, + "loss/logits": 0.0013754194369539618, + "step": 31 + }, + { + "epoch": 0.032, + "grad_norm": 0.54296875, + "grad_norm_var": 0.03793176015218099, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.0137, + "loss/crossentropy": 1.635874330997467, + "loss/hidden": 0.01190185546875, + "loss/logits": 0.0017871989402920008, + "step": 32 + }, + { + "epoch": 0.033, + "grad_norm": 0.76171875, + "grad_norm_var": 0.03254489898681641, + "learning_rate": 6.600000000000001e-06, + "loss": 0.0143, + "loss/crossentropy": 1.0347481966018677, + "loss/hidden": 0.01300048828125, + "loss/logits": 0.0012789819156751037, + "step": 33 + }, + { + "epoch": 0.034, + "grad_norm": 0.515625, + "grad_norm_var": 0.032269287109375, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0132, + "loss/crossentropy": 2.0032879114151, + "loss/hidden": 0.011383056640625, + "loss/logits": 0.0018645224627107382, + "step": 34 + }, + { + "epoch": 0.035, + "grad_norm": 1.0703125, + "grad_norm_var": 0.04636419614156087, + "learning_rate": 7e-06, + "loss": 0.0143, + "loss/crossentropy": 1.8410796523094177, + "loss/hidden": 0.01226806640625, + "loss/logits": 0.001986370305530727, + "step": 35 + }, + { + "epoch": 0.036, + "grad_norm": 0.4296875, + "grad_norm_var": 0.045703490575154625, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.0136, + "loss/crossentropy": 1.9098870158195496, + "loss/hidden": 0.01171875, + "loss/logits": 0.0018596722511574626, + "step": 36 + }, + { + "epoch": 0.037, + "grad_norm": 68.0, + "grad_norm_var": 284.03319854736327, + "learning_rate": 7.4e-06, + "loss": 0.0558, + "loss/crossentropy": 1.5951663255691528, + "loss/hidden": 0.051666259765625, + "loss/logits": 0.004160793498158455, + "step": 37 + }, + { + "epoch": 0.038, + "grad_norm": 0.380859375, + "grad_norm_var": 284.0946207046509, + "learning_rate": 7.600000000000001e-06, + "loss": 0.0133, + "loss/crossentropy": 2.25837504863739, + "loss/hidden": 0.01129150390625, + "loss/logits": 0.0020168160554021597, + "step": 38 + }, + { + "epoch": 0.039, + "grad_norm": 0.455078125, + "grad_norm_var": 284.1822828769684, + "learning_rate": 7.800000000000002e-06, + "loss": 0.0126, + "loss/crossentropy": 2.126526176929474, + "loss/hidden": 0.0107421875, + "loss/logits": 0.0018400833941996098, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 0.63671875, + "grad_norm_var": 284.27119545936586, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0142, + "loss/crossentropy": 1.4863142371177673, + "loss/hidden": 0.012481689453125, + "loss/logits": 0.0017027563299052417, + "step": 40 + }, + { + "epoch": 0.041, + "grad_norm": 0.283203125, + "grad_norm_var": 284.3175859928131, + "learning_rate": 8.2e-06, + "loss": 0.0112, + "loss/crossentropy": 2.0888695120811462, + "loss/hidden": 0.009521484375, + "loss/logits": 0.0017255974235013127, + "step": 41 + }, + { + "epoch": 0.042, + "grad_norm": 0.431640625, + "grad_norm_var": 284.5420877456665, + "learning_rate": 8.400000000000001e-06, + "loss": 0.0173, + "loss/crossentropy": 1.611488163471222, + "loss/hidden": 0.015380859375, + "loss/logits": 0.0019445380312390625, + "step": 42 + }, + { + "epoch": 0.043, + "grad_norm": 0.419921875, + "grad_norm_var": 284.6142045180003, + "learning_rate": 8.6e-06, + "loss": 0.0166, + "loss/crossentropy": 1.8987411260604858, + "loss/hidden": 0.0146484375, + "loss/logits": 0.0019467678503133357, + "step": 43 + }, + { + "epoch": 0.044, + "grad_norm": 0.58203125, + "grad_norm_var": 284.6949343204498, + "learning_rate": 8.8e-06, + "loss": 0.0183, + "loss/crossentropy": 1.4084473848342896, + "loss/hidden": 0.01605224609375, + "loss/logits": 0.002271471545100212, + "step": 44 + }, + { + "epoch": 0.045, + "grad_norm": 0.380859375, + "grad_norm_var": 284.71635888417563, + "learning_rate": 9e-06, + "loss": 0.0159, + "loss/crossentropy": 1.6970309615135193, + "loss/hidden": 0.01397705078125, + "loss/logits": 0.0019325784523971379, + "step": 45 + }, + { + "epoch": 0.046, + "grad_norm": 0.455078125, + "grad_norm_var": 284.64517935117084, + "learning_rate": 9.200000000000002e-06, + "loss": 0.0165, + "loss/crossentropy": 2.1346731781959534, + "loss/hidden": 0.014312744140625, + "loss/logits": 0.002142712823115289, + "step": 46 + }, + { + "epoch": 0.047, + "grad_norm": 2.21875, + "grad_norm_var": 283.818000014623, + "learning_rate": 9.4e-06, + "loss": 0.0175, + "loss/crossentropy": 1.6114214062690735, + "loss/hidden": 0.0155029296875, + "loss/logits": 0.0020421514636836946, + "step": 47 + }, + { + "epoch": 0.048, + "grad_norm": 0.44921875, + "grad_norm_var": 283.87235945065817, + "learning_rate": 9.600000000000001e-06, + "loss": 0.0157, + "loss/crossentropy": 2.056842625141144, + "loss/hidden": 0.013671875, + "loss/logits": 0.0020451846066862345, + "step": 48 + }, + { + "epoch": 0.049, + "grad_norm": 0.439453125, + "grad_norm_var": 284.05417149861654, + "learning_rate": 9.800000000000001e-06, + "loss": 0.016, + "loss/crossentropy": 1.5892411470413208, + "loss/hidden": 0.013946533203125, + "loss/logits": 0.00205704930704087, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 0.3359375, + "grad_norm_var": 284.15935770670575, + "learning_rate": 1e-05, + "loss": 0.0153, + "loss/crossentropy": 2.3872954845428467, + "loss/hidden": 0.01312255859375, + "loss/logits": 0.0021313573233783245, + "step": 50 + }, + { + "epoch": 0.051, + "grad_norm": 0.451171875, + "grad_norm_var": 284.49208029111225, + "learning_rate": 1.02e-05, + "loss": 0.0168, + "loss/crossentropy": 2.0149841904640198, + "loss/hidden": 0.01470947265625, + "loss/logits": 0.0020815907046198845, + "step": 51 + }, + { + "epoch": 0.052, + "grad_norm": 0.51953125, + "grad_norm_var": 284.44056928952534, + "learning_rate": 1.04e-05, + "loss": 0.021, + "loss/crossentropy": 1.9311216473579407, + "loss/hidden": 0.0185546875, + "loss/logits": 0.0024686548858880997, + "step": 52 + }, + { + "epoch": 0.053, + "grad_norm": 0.546875, + "grad_norm_var": 0.20315702756245932, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.0204, + "loss/crossentropy": 1.9871841073036194, + "loss/hidden": 0.01806640625, + "loss/logits": 0.00237347767688334, + "step": 53 + }, + { + "epoch": 0.054, + "grad_norm": 0.51171875, + "grad_norm_var": 0.2010729471842448, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.0195, + "loss/crossentropy": 1.4909774661064148, + "loss/hidden": 0.017578125, + "loss/logits": 0.0018839699332602322, + "step": 54 + }, + { + "epoch": 0.055, + "grad_norm": 0.376953125, + "grad_norm_var": 0.20264968872070313, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.0188, + "loss/crossentropy": 1.731587290763855, + "loss/hidden": 0.01666259765625, + "loss/logits": 0.0021363290725275874, + "step": 55 + }, + { + "epoch": 0.056, + "grad_norm": 0.482421875, + "grad_norm_var": 0.20266098976135255, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.0198, + "loss/crossentropy": 1.8391692638397217, + "loss/hidden": 0.01751708984375, + "loss/logits": 0.0022706754971295595, + "step": 56 + }, + { + "epoch": 0.057, + "grad_norm": 0.82421875, + "grad_norm_var": 0.20132694244384766, + "learning_rate": 1.14e-05, + "loss": 0.0181, + "loss/crossentropy": 1.326266534626484, + "loss/hidden": 0.01654052734375, + "loss/logits": 0.0015604346699547023, + "step": 57 + }, + { + "epoch": 0.058, + "grad_norm": 0.41015625, + "grad_norm_var": 0.2018068790435791, + "learning_rate": 1.16e-05, + "loss": 0.0185, + "loss/crossentropy": 2.5511186122894287, + "loss/hidden": 0.01611328125, + "loss/logits": 0.0024241225328296423, + "step": 58 + }, + { + "epoch": 0.059, + "grad_norm": 1.609375, + "grad_norm_var": 0.26361236572265623, + "learning_rate": 1.18e-05, + "loss": 0.0183, + "loss/crossentropy": 1.0930684125050902, + "loss/hidden": 0.01702880859375, + "loss/logits": 0.0013018156460020691, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 0.486328125, + "grad_norm_var": 0.2652066389719645, + "learning_rate": 1.2e-05, + "loss": 0.02, + "loss/crossentropy": 2.0819135308265686, + "loss/hidden": 0.0174560546875, + "loss/logits": 0.0025293552316725254, + "step": 60 + }, + { + "epoch": 0.061, + "grad_norm": 1.09375, + "grad_norm_var": 0.2708051045735677, + "learning_rate": 1.22e-05, + "loss": 0.0183, + "loss/crossentropy": 0.9290539920330048, + "loss/hidden": 0.016754150390625, + "loss/logits": 0.0015562092885375023, + "step": 61 + }, + { + "epoch": 0.062, + "grad_norm": 0.453125, + "grad_norm_var": 0.2708693027496338, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.0227, + "loss/crossentropy": 2.1691651344299316, + "loss/hidden": 0.01995849609375, + "loss/logits": 0.002767750178463757, + "step": 62 + }, + { + "epoch": 0.063, + "grad_norm": 0.4765625, + "grad_norm_var": 0.10790785153706868, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.0233, + "loss/crossentropy": 2.1545491218566895, + "loss/hidden": 0.0205078125, + "loss/logits": 0.002785824006423354, + "step": 63 + }, + { + "epoch": 0.064, + "grad_norm": 0.47265625, + "grad_norm_var": 0.10749700864156088, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.0223, + "loss/crossentropy": 1.9527725577354431, + "loss/hidden": 0.01971435546875, + "loss/logits": 0.0025634407065808773, + "step": 64 + }, + { + "epoch": 0.065, + "grad_norm": 0.55078125, + "grad_norm_var": 0.10599034627278646, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.0256, + "loss/crossentropy": 1.8496606945991516, + "loss/hidden": 0.02288818359375, + "loss/logits": 0.0027499888092279434, + "step": 65 + }, + { + "epoch": 0.066, + "grad_norm": 0.55859375, + "grad_norm_var": 0.1012465794881185, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.0221, + "loss/crossentropy": 1.9440131187438965, + "loss/hidden": 0.01971435546875, + "loss/logits": 0.002431391447316855, + "step": 66 + }, + { + "epoch": 0.067, + "grad_norm": 0.498046875, + "grad_norm_var": 0.10036614735921225, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.0241, + "loss/crossentropy": 1.7777947187423706, + "loss/hidden": 0.02142333984375, + "loss/logits": 0.0026856372132897377, + "step": 67 + }, + { + "epoch": 0.068, + "grad_norm": 0.66015625, + "grad_norm_var": 0.09977563222249348, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.0241, + "loss/crossentropy": 1.6634170711040497, + "loss/hidden": 0.02178955078125, + "loss/logits": 0.002268874435685575, + "step": 68 + }, + { + "epoch": 0.069, + "grad_norm": 0.359375, + "grad_norm_var": 0.1039443333943685, + "learning_rate": 1.38e-05, + "loss": 0.0217, + "loss/crossentropy": 1.9945446252822876, + "loss/hidden": 0.019287109375, + "loss/logits": 0.0024602848570793867, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 0.546875, + "grad_norm_var": 0.10354207356770834, + "learning_rate": 1.4e-05, + "loss": 0.0212, + "loss/crossentropy": 2.234881281852722, + "loss/hidden": 0.0185546875, + "loss/logits": 0.0026649613864719868, + "step": 70 + }, + { + "epoch": 0.071, + "grad_norm": 0.5390625, + "grad_norm_var": 0.1000130812327067, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.0235, + "loss/crossentropy": 2.3283374309539795, + "loss/hidden": 0.0206298828125, + "loss/logits": 0.0028440920868888497, + "step": 71 + }, + { + "epoch": 0.072, + "grad_norm": 0.96484375, + "grad_norm_var": 0.10530134836832682, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.0273, + "loss/crossentropy": 2.446515917778015, + "loss/hidden": 0.0244140625, + "loss/logits": 0.002847215859219432, + "step": 72 + }, + { + "epoch": 0.073, + "grad_norm": 0.66015625, + "grad_norm_var": 0.10331465403238932, + "learning_rate": 1.46e-05, + "loss": 0.0313, + "loss/crossentropy": 1.8365015387535095, + "loss/hidden": 0.0277099609375, + "loss/logits": 0.003543111262843013, + "step": 73 + }, + { + "epoch": 0.074, + "grad_norm": 0.58203125, + "grad_norm_var": 0.0997507095336914, + "learning_rate": 1.48e-05, + "loss": 0.0275, + "loss/crossentropy": 1.8750606179237366, + "loss/hidden": 0.0244140625, + "loss/logits": 0.0030850095208734274, + "step": 74 + }, + { + "epoch": 0.075, + "grad_norm": 0.6171875, + "grad_norm_var": 0.03528436024983724, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0285, + "loss/crossentropy": 1.6197695136070251, + "loss/hidden": 0.02557373046875, + "loss/logits": 0.002948817447759211, + "step": 75 + }, + { + "epoch": 0.076, + "grad_norm": 0.5546875, + "grad_norm_var": 0.034586191177368164, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.0253, + "loss/crossentropy": 2.139370322227478, + "loss/hidden": 0.0225830078125, + "loss/logits": 0.002709153341129422, + "step": 76 + }, + { + "epoch": 0.077, + "grad_norm": 0.78125, + "grad_norm_var": 0.020085255304972332, + "learning_rate": 1.54e-05, + "loss": 0.0308, + "loss/crossentropy": 1.5335928797721863, + "loss/hidden": 0.02777099609375, + "loss/logits": 0.00305762467905879, + "step": 77 + }, + { + "epoch": 0.078, + "grad_norm": 0.5078125, + "grad_norm_var": 0.019349145889282226, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.0273, + "loss/crossentropy": 2.623558282852173, + "loss/hidden": 0.024169921875, + "loss/logits": 0.0031643210677430034, + "step": 78 + }, + { + "epoch": 0.079, + "grad_norm": 0.470703125, + "grad_norm_var": 0.019434547424316405, + "learning_rate": 1.58e-05, + "loss": 0.0275, + "loss/crossentropy": 2.3246337175369263, + "loss/hidden": 0.0242919921875, + "loss/logits": 0.0031679703388363123, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 0.431640625, + "grad_norm_var": 0.0201418399810791, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.0254, + "loss/crossentropy": 1.801970660686493, + "loss/hidden": 0.0228271484375, + "loss/logits": 0.0025987064000219107, + "step": 80 + }, + { + "epoch": 0.081, + "grad_norm": 0.44921875, + "grad_norm_var": 0.021184905370076498, + "learning_rate": 1.62e-05, + "loss": 0.0265, + "loss/crossentropy": 1.9489317536354065, + "loss/hidden": 0.02374267578125, + "loss/logits": 0.0027701087528839707, + "step": 81 + }, + { + "epoch": 0.082, + "grad_norm": 0.67578125, + "grad_norm_var": 0.02180479367574056, + "learning_rate": 1.64e-05, + "loss": 0.034, + "loss/crossentropy": 1.7697851061820984, + "loss/hidden": 0.03070068359375, + "loss/logits": 0.003283574478700757, + "step": 82 + }, + { + "epoch": 0.083, + "grad_norm": 0.57421875, + "grad_norm_var": 0.021323140462239584, + "learning_rate": 1.66e-05, + "loss": 0.0309, + "loss/crossentropy": 1.5783970654010773, + "loss/hidden": 0.028076171875, + "loss/logits": 0.002809713245369494, + "step": 83 + }, + { + "epoch": 0.084, + "grad_norm": 0.53125, + "grad_norm_var": 0.02108605702718099, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.0332, + "loss/crossentropy": 1.460361659526825, + "loss/hidden": 0.0303955078125, + "loss/logits": 0.0027706819819286466, + "step": 84 + }, + { + "epoch": 0.085, + "grad_norm": 0.6015625, + "grad_norm_var": 0.017696062723795574, + "learning_rate": 1.7e-05, + "loss": 0.0324, + "loss/crossentropy": 2.1110434532165527, + "loss/hidden": 0.02911376953125, + "loss/logits": 0.0033112409291788936, + "step": 85 + }, + { + "epoch": 0.086, + "grad_norm": 0.451171875, + "grad_norm_var": 0.018857304255167642, + "learning_rate": 1.72e-05, + "loss": 0.0291, + "loss/crossentropy": 1.7163687944412231, + "loss/hidden": 0.02630615234375, + "loss/logits": 0.0027680074563249946, + "step": 86 + }, + { + "epoch": 0.087, + "grad_norm": 0.5703125, + "grad_norm_var": 0.018718449274698894, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.0339, + "loss/crossentropy": 1.8893783688545227, + "loss/hidden": 0.03021240234375, + "loss/logits": 0.0037144168745726347, + "step": 87 + }, + { + "epoch": 0.088, + "grad_norm": 1.75, + "grad_norm_var": 0.0965951124827067, + "learning_rate": 1.76e-05, + "loss": 0.0293, + "loss/crossentropy": 1.0857177823781967, + "loss/hidden": 0.02716064453125, + "loss/logits": 0.002114512084517628, + "step": 88 + }, + { + "epoch": 0.089, + "grad_norm": 0.4609375, + "grad_norm_var": 0.09848872820536296, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.0278, + "loss/crossentropy": 2.1670188307762146, + "loss/hidden": 0.0250244140625, + "loss/logits": 0.0027708488050848246, + "step": 89 + }, + { + "epoch": 0.09, + "grad_norm": 2.984375, + "grad_norm_var": 0.4452332655588786, + "learning_rate": 1.8e-05, + "loss": 0.034, + "loss/crossentropy": 0.8697951380163431, + "loss/hidden": 0.0322265625, + "loss/logits": 0.0017659573932178319, + "step": 90 + }, + { + "epoch": 0.091, + "grad_norm": 0.58984375, + "grad_norm_var": 0.44585811297098793, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.0315, + "loss/crossentropy": 2.0653520226478577, + "loss/hidden": 0.02813720703125, + "loss/logits": 0.003313788794912398, + "step": 91 + }, + { + "epoch": 0.092, + "grad_norm": 0.66015625, + "grad_norm_var": 0.44346858660380045, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.0352, + "loss/crossentropy": 2.1175276041030884, + "loss/hidden": 0.0318603515625, + "loss/logits": 0.003378898836672306, + "step": 92 + }, + { + "epoch": 0.093, + "grad_norm": 0.478515625, + "grad_norm_var": 0.44917195638020835, + "learning_rate": 1.86e-05, + "loss": 0.0328, + "loss/crossentropy": 2.192784309387207, + "loss/hidden": 0.029296875, + "loss/logits": 0.003497788915410638, + "step": 93 + }, + { + "epoch": 0.094, + "grad_norm": 0.50390625, + "grad_norm_var": 0.4493051528930664, + "learning_rate": 1.88e-05, + "loss": 0.0342, + "loss/crossentropy": 1.8000940680503845, + "loss/hidden": 0.0308837890625, + "loss/logits": 0.003295119386166334, + "step": 94 + }, + { + "epoch": 0.095, + "grad_norm": 0.86328125, + "grad_norm_var": 0.44371743202209474, + "learning_rate": 1.9e-05, + "loss": 0.0376, + "loss/crossentropy": 1.9514374732971191, + "loss/hidden": 0.0340576171875, + "loss/logits": 0.0035327656660228968, + "step": 95 + }, + { + "epoch": 0.096, + "grad_norm": 0.55859375, + "grad_norm_var": 0.4387262980143229, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.0334, + "loss/crossentropy": 1.7834157943725586, + "loss/hidden": 0.03021240234375, + "loss/logits": 0.003167669870890677, + "step": 96 + }, + { + "epoch": 0.097, + "grad_norm": 0.71484375, + "grad_norm_var": 0.4309270222981771, + "learning_rate": 1.94e-05, + "loss": 0.0327, + "loss/crossentropy": 1.6889591813087463, + "loss/hidden": 0.02972412109375, + "loss/logits": 0.0029616469983011484, + "step": 97 + }, + { + "epoch": 0.098, + "grad_norm": 0.56640625, + "grad_norm_var": 0.4336400349934896, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.0354, + "loss/crossentropy": 1.7813147902488708, + "loss/hidden": 0.031982421875, + "loss/logits": 0.003417789936065674, + "step": 98 + }, + { + "epoch": 0.099, + "grad_norm": 0.9140625, + "grad_norm_var": 0.43045953114827473, + "learning_rate": 1.98e-05, + "loss": 0.0376, + "loss/crossentropy": 1.3951178789138794, + "loss/hidden": 0.0345458984375, + "loss/logits": 0.0030310061993077397, + "step": 99 + }, + { + "epoch": 0.1, + "grad_norm": 0.56640625, + "grad_norm_var": 0.4291600545247396, + "learning_rate": 2e-05, + "loss": 0.0364, + "loss/crossentropy": 2.255498170852661, + "loss/hidden": 0.03277587890625, + "loss/logits": 0.0036420804681256413, + "step": 100 + }, + { + "epoch": 0.101, + "grad_norm": 0.58984375, + "grad_norm_var": 0.429521115620931, + "learning_rate": 2e-05, + "loss": 0.033, + "loss/crossentropy": 2.4104394912719727, + "loss/hidden": 0.02960205078125, + "loss/logits": 0.0033488960471004248, + "step": 101 + }, + { + "epoch": 0.102, + "grad_norm": 4.8125, + "grad_norm_var": 1.4001366774241129, + "learning_rate": 2e-05, + "loss": 0.0477, + "loss/crossentropy": 1.0830636993050575, + "loss/hidden": 0.0452880859375, + "loss/logits": 0.0023841604124754667, + "step": 102 + }, + { + "epoch": 0.103, + "grad_norm": 4.1875, + "grad_norm_var": 1.9629084110260009, + "learning_rate": 2e-05, + "loss": 0.0475, + "loss/crossentropy": 0.7437883876264095, + "loss/hidden": 0.0455322265625, + "loss/logits": 0.0019981139339506626, + "step": 103 + }, + { + "epoch": 0.104, + "grad_norm": 0.77734375, + "grad_norm_var": 1.9669294834136963, + "learning_rate": 2e-05, + "loss": 0.0387, + "loss/crossentropy": 2.1284059882164, + "loss/hidden": 0.0345458984375, + "loss/logits": 0.00411223981063813, + "step": 104 + }, + { + "epoch": 0.105, + "grad_norm": 1.6796875, + "grad_norm_var": 1.92922043800354, + "learning_rate": 2e-05, + "loss": 0.0459, + "loss/crossentropy": 2.1119471192359924, + "loss/hidden": 0.0411376953125, + "loss/logits": 0.0047579677775502205, + "step": 105 + }, + { + "epoch": 0.106, + "grad_norm": 0.90234375, + "grad_norm_var": 1.7437895298004151, + "learning_rate": 2e-05, + "loss": 0.044, + "loss/crossentropy": 2.391239643096924, + "loss/hidden": 0.0390625, + "loss/logits": 0.004930721828714013, + "step": 106 + }, + { + "epoch": 0.107, + "grad_norm": 1.6875, + "grad_norm_var": 1.7282822767893473, + "learning_rate": 2e-05, + "loss": 0.0451, + "loss/crossentropy": 1.7602136731147766, + "loss/hidden": 0.040283203125, + "loss/logits": 0.004797366913408041, + "step": 107 + }, + { + "epoch": 0.108, + "grad_norm": 0.8828125, + "grad_norm_var": 1.7130108992258708, + "learning_rate": 2e-05, + "loss": 0.0428, + "loss/crossentropy": 2.0745638012886047, + "loss/hidden": 0.0386962890625, + "loss/logits": 0.004113797098398209, + "step": 108 + }, + { + "epoch": 0.109, + "grad_norm": 0.82421875, + "grad_norm_var": 1.6829447428385416, + "learning_rate": 2e-05, + "loss": 0.0422, + "loss/crossentropy": 1.685157299041748, + "loss/hidden": 0.03857421875, + "loss/logits": 0.0036494951928034425, + "step": 109 + }, + { + "epoch": 0.11, + "grad_norm": 1.5703125, + "grad_norm_var": 1.6387715021769205, + "learning_rate": 2e-05, + "loss": 0.0376, + "loss/crossentropy": 2.625019073486328, + "loss/hidden": 0.03369140625, + "loss/logits": 0.0039150441298261285, + "step": 110 + }, + { + "epoch": 0.111, + "grad_norm": 1.5234375, + "grad_norm_var": 1.6204302469889322, + "learning_rate": 2e-05, + "loss": 0.0422, + "loss/crossentropy": 0.676440417766571, + "loss/hidden": 0.0401611328125, + "loss/logits": 0.0020512532209977508, + "step": 111 + }, + { + "epoch": 0.112, + "grad_norm": 0.65234375, + "grad_norm_var": 1.6101824442545574, + "learning_rate": 2e-05, + "loss": 0.0479, + "loss/crossentropy": 1.8928841352462769, + "loss/hidden": 0.0435791015625, + "loss/logits": 0.00434900657273829, + "step": 112 + }, + { + "epoch": 0.113, + "grad_norm": 1.09375, + "grad_norm_var": 1.5831150690714517, + "learning_rate": 2e-05, + "loss": 0.0498, + "loss/crossentropy": 1.2006176710128784, + "loss/hidden": 0.04638671875, + "loss/logits": 0.0034257903462275863, + "step": 113 + }, + { + "epoch": 0.114, + "grad_norm": 0.84375, + "grad_norm_var": 1.5551775614420573, + "learning_rate": 2e-05, + "loss": 0.0437, + "loss/crossentropy": 2.164067029953003, + "loss/hidden": 0.03955078125, + "loss/logits": 0.004164737183600664, + "step": 114 + }, + { + "epoch": 0.115, + "grad_norm": 0.875, + "grad_norm_var": 1.5581644694010417, + "learning_rate": 2e-05, + "loss": 0.0469, + "loss/crossentropy": 1.963140070438385, + "loss/hidden": 0.0419921875, + "loss/logits": 0.004867425188422203, + "step": 115 + }, + { + "epoch": 0.116, + "grad_norm": 0.83984375, + "grad_norm_var": 1.530010732014974, + "learning_rate": 2e-05, + "loss": 0.0469, + "loss/crossentropy": 1.936423420906067, + "loss/hidden": 0.04248046875, + "loss/logits": 0.004457900300621986, + "step": 116 + }, + { + "epoch": 0.117, + "grad_norm": 1.0, + "grad_norm_var": 1.4916320164998373, + "learning_rate": 2e-05, + "loss": 0.044, + "loss/crossentropy": 1.9027796387672424, + "loss/hidden": 0.0396728515625, + "loss/logits": 0.004306067014113069, + "step": 117 + }, + { + "epoch": 0.118, + "grad_norm": 0.921875, + "grad_norm_var": 0.724272092183431, + "learning_rate": 2e-05, + "loss": 0.048, + "loss/crossentropy": 1.4962169528007507, + "loss/hidden": 0.043212890625, + "loss/logits": 0.004831232130527496, + "step": 118 + }, + { + "epoch": 0.119, + "grad_norm": 1.3046875, + "grad_norm_var": 0.12087090810139973, + "learning_rate": 2e-05, + "loss": 0.0458, + "loss/crossentropy": 1.8558754324913025, + "loss/hidden": 0.04150390625, + "loss/logits": 0.004260358400642872, + "step": 119 + }, + { + "epoch": 0.12, + "grad_norm": 0.7421875, + "grad_norm_var": 0.12239583333333333, + "learning_rate": 2e-05, + "loss": 0.0467, + "loss/crossentropy": 2.163163900375366, + "loss/hidden": 0.042236328125, + "loss/logits": 0.0044949238654226065, + "step": 120 + }, + { + "epoch": 0.121, + "grad_norm": 0.66796875, + "grad_norm_var": 0.10601139068603516, + "learning_rate": 2e-05, + "loss": 0.0429, + "loss/crossentropy": 1.875292718410492, + "loss/hidden": 0.0389404296875, + "loss/logits": 0.003972187405452132, + "step": 121 + }, + { + "epoch": 0.122, + "grad_norm": 0.97265625, + "grad_norm_var": 0.1052103042602539, + "learning_rate": 2e-05, + "loss": 0.0504, + "loss/crossentropy": 1.581692636013031, + "loss/hidden": 0.0462646484375, + "loss/logits": 0.0040856958366930485, + "step": 122 + }, + { + "epoch": 0.123, + "grad_norm": 0.77734375, + "grad_norm_var": 0.07660497029622396, + "learning_rate": 2e-05, + "loss": 0.0467, + "loss/crossentropy": 2.185007333755493, + "loss/hidden": 0.0419921875, + "loss/logits": 0.0047312104143202305, + "step": 123 + }, + { + "epoch": 0.124, + "grad_norm": 0.70703125, + "grad_norm_var": 0.08053887685139974, + "learning_rate": 2e-05, + "loss": 0.0527, + "loss/crossentropy": 1.7746418118476868, + "loss/hidden": 0.0482177734375, + "loss/logits": 0.004488097038120031, + "step": 124 + }, + { + "epoch": 0.125, + "grad_norm": 0.82421875, + "grad_norm_var": 0.08053887685139974, + "learning_rate": 2e-05, + "loss": 0.0483, + "loss/crossentropy": 1.8139249682426453, + "loss/hidden": 0.044189453125, + "loss/logits": 0.00407675513997674, + "step": 125 + }, + { + "epoch": 0.126, + "grad_norm": 0.80078125, + "grad_norm_var": 0.05464986165364583, + "learning_rate": 2e-05, + "loss": 0.0536, + "loss/crossentropy": 1.8078742623329163, + "loss/hidden": 0.0489501953125, + "loss/logits": 0.004657944664359093, + "step": 126 + }, + { + "epoch": 0.127, + "grad_norm": 1.09375, + "grad_norm_var": 0.030997467041015626, + "learning_rate": 2e-05, + "loss": 0.0496, + "loss/crossentropy": 2.0267322659492493, + "loss/hidden": 0.0447998046875, + "loss/logits": 0.0047590641770511866, + "step": 127 + }, + { + "epoch": 0.128, + "grad_norm": 0.85546875, + "grad_norm_var": 0.027347564697265625, + "learning_rate": 2e-05, + "loss": 0.0587, + "loss/crossentropy": 1.6603793501853943, + "loss/hidden": 0.052978515625, + "loss/logits": 0.005712392507120967, + "step": 128 + }, + { + "epoch": 0.129, + "grad_norm": 5.375, + "grad_norm_var": 1.286358388264974, + "learning_rate": 2e-05, + "loss": 0.0577, + "loss/crossentropy": 0.8844976872205734, + "loss/hidden": 0.0550537109375, + "loss/logits": 0.0026012896560132504, + "step": 129 + }, + { + "epoch": 0.13, + "grad_norm": 0.94140625, + "grad_norm_var": 1.2828027725219726, + "learning_rate": 2e-05, + "loss": 0.0532, + "loss/crossentropy": 2.151723265647888, + "loss/hidden": 0.04833984375, + "loss/logits": 0.0048982377629727125, + "step": 130 + }, + { + "epoch": 0.131, + "grad_norm": 0.92578125, + "grad_norm_var": 1.280975341796875, + "learning_rate": 2e-05, + "loss": 0.048, + "loss/crossentropy": 2.190707802772522, + "loss/hidden": 0.0435791015625, + "loss/logits": 0.004458446754142642, + "step": 131 + }, + { + "epoch": 0.132, + "grad_norm": 0.73828125, + "grad_norm_var": 1.2861162821451824, + "learning_rate": 2e-05, + "loss": 0.0562, + "loss/crossentropy": 2.0854132175445557, + "loss/hidden": 0.0511474609375, + "loss/logits": 0.005020990269258618, + "step": 132 + }, + { + "epoch": 0.133, + "grad_norm": 0.6796875, + "grad_norm_var": 1.299598185221354, + "learning_rate": 2e-05, + "loss": 0.0509, + "loss/crossentropy": 2.0993438959121704, + "loss/hidden": 0.046142578125, + "loss/logits": 0.004787095822393894, + "step": 133 + }, + { + "epoch": 0.134, + "grad_norm": 0.96875, + "grad_norm_var": 1.2983378092447917, + "learning_rate": 2e-05, + "loss": 0.0491, + "loss/crossentropy": 2.2328933477401733, + "loss/hidden": 0.0445556640625, + "loss/logits": 0.004536583088338375, + "step": 134 + }, + { + "epoch": 0.135, + "grad_norm": 1.0625, + "grad_norm_var": 1.2969581604003906, + "learning_rate": 2e-05, + "loss": 0.0638, + "loss/crossentropy": 1.9981300234794617, + "loss/hidden": 0.0579833984375, + "loss/logits": 0.00582107319496572, + "step": 135 + }, + { + "epoch": 0.136, + "grad_norm": 0.6796875, + "grad_norm_var": 1.3004615783691407, + "learning_rate": 2e-05, + "loss": 0.0542, + "loss/crossentropy": 2.1993343830108643, + "loss/hidden": 0.049072265625, + "loss/logits": 0.005134769715368748, + "step": 136 + }, + { + "epoch": 0.137, + "grad_norm": 3.5, + "grad_norm_var": 1.627500343322754, + "learning_rate": 2e-05, + "loss": 0.0595, + "loss/crossentropy": 1.469780683517456, + "loss/hidden": 0.0552978515625, + "loss/logits": 0.0042177007999271154, + "step": 137 + }, + { + "epoch": 0.138, + "grad_norm": 0.87109375, + "grad_norm_var": 1.632664426167806, + "learning_rate": 2e-05, + "loss": 0.0554, + "loss/crossentropy": 1.8814529180526733, + "loss/hidden": 0.0506591796875, + "loss/logits": 0.004711252404376864, + "step": 138 + }, + { + "epoch": 0.139, + "grad_norm": 0.9140625, + "grad_norm_var": 1.62430419921875, + "learning_rate": 2e-05, + "loss": 0.0542, + "loss/crossentropy": 1.9769226908683777, + "loss/hidden": 0.049560546875, + "loss/logits": 0.004602615023031831, + "step": 139 + }, + { + "epoch": 0.14, + "grad_norm": 1.296875, + "grad_norm_var": 1.5987385431925456, + "learning_rate": 2e-05, + "loss": 0.0562, + "loss/crossentropy": 1.3646953105926514, + "loss/hidden": 0.0516357421875, + "loss/logits": 0.0045162534806877375, + "step": 140 + }, + { + "epoch": 0.141, + "grad_norm": 0.91796875, + "grad_norm_var": 1.592772356669108, + "learning_rate": 2e-05, + "loss": 0.0586, + "loss/crossentropy": 1.5901939272880554, + "loss/hidden": 0.0538330078125, + "loss/logits": 0.004788138438016176, + "step": 141 + }, + { + "epoch": 0.142, + "grad_norm": 1.109375, + "grad_norm_var": 1.5760719299316406, + "learning_rate": 2e-05, + "loss": 0.0686, + "loss/crossentropy": 1.8436982035636902, + "loss/hidden": 0.062744140625, + "loss/logits": 0.005897135473787785, + "step": 142 + }, + { + "epoch": 0.143, + "grad_norm": 1.0, + "grad_norm_var": 1.5800819396972656, + "learning_rate": 2e-05, + "loss": 0.0677, + "loss/crossentropy": 1.7922558188438416, + "loss/hidden": 0.06103515625, + "loss/logits": 0.006622593384236097, + "step": 143 + }, + { + "epoch": 0.144, + "grad_norm": 1.046875, + "grad_norm_var": 1.5693745295206705, + "learning_rate": 2e-05, + "loss": 0.0626, + "loss/crossentropy": 1.8654756546020508, + "loss/hidden": 0.05712890625, + "loss/logits": 0.005447414005175233, + "step": 144 + }, + { + "epoch": 0.145, + "grad_norm": 0.8046875, + "grad_norm_var": 0.43840071360270183, + "learning_rate": 2e-05, + "loss": 0.0653, + "loss/crossentropy": 2.023370146751404, + "loss/hidden": 0.0596923828125, + "loss/logits": 0.005567178362980485, + "step": 145 + }, + { + "epoch": 0.146, + "grad_norm": 1.7265625, + "grad_norm_var": 0.4612627665201823, + "learning_rate": 2e-05, + "loss": 0.0718, + "loss/crossentropy": 1.2652358412742615, + "loss/hidden": 0.066162109375, + "loss/logits": 0.00563872791826725, + "step": 146 + }, + { + "epoch": 0.147, + "grad_norm": 0.8359375, + "grad_norm_var": 0.4643350601196289, + "learning_rate": 2e-05, + "loss": 0.0579, + "loss/crossentropy": 2.181838572025299, + "loss/hidden": 0.0528564453125, + "loss/logits": 0.0050070807337760925, + "step": 147 + }, + { + "epoch": 0.148, + "grad_norm": 1.65625, + "grad_norm_var": 0.4685035705566406, + "learning_rate": 2e-05, + "loss": 0.0653, + "loss/crossentropy": 1.6760476231575012, + "loss/hidden": 0.059814453125, + "loss/logits": 0.005448109935969114, + "step": 148 + }, + { + "epoch": 0.149, + "grad_norm": 0.875, + "grad_norm_var": 0.45754903157552085, + "learning_rate": 2e-05, + "loss": 0.0608, + "loss/crossentropy": 1.9610846042633057, + "loss/hidden": 0.05517578125, + "loss/logits": 0.0055898819118738174, + "step": 149 + }, + { + "epoch": 0.15, + "grad_norm": 1.2890625, + "grad_norm_var": 0.45391006469726564, + "learning_rate": 2e-05, + "loss": 0.0607, + "loss/crossentropy": 2.0354663729667664, + "loss/hidden": 0.054931640625, + "loss/logits": 0.005750466603785753, + "step": 150 + }, + { + "epoch": 0.151, + "grad_norm": 0.91015625, + "grad_norm_var": 0.4586435317993164, + "learning_rate": 2e-05, + "loss": 0.061, + "loss/crossentropy": 1.5509551763534546, + "loss/hidden": 0.05615234375, + "loss/logits": 0.00486933346837759, + "step": 151 + }, + { + "epoch": 0.152, + "grad_norm": 1.8203125, + "grad_norm_var": 0.45860640207926434, + "learning_rate": 2e-05, + "loss": 0.0823, + "loss/crossentropy": 1.3190861344337463, + "loss/hidden": 0.076171875, + "loss/logits": 0.006146557629108429, + "step": 152 + }, + { + "epoch": 0.153, + "grad_norm": 1.65625, + "grad_norm_var": 0.12676741282145182, + "learning_rate": 2e-05, + "loss": 0.0689, + "loss/crossentropy": 2.0075970888137817, + "loss/hidden": 0.0626220703125, + "loss/logits": 0.0062951259315013885, + "step": 153 + }, + { + "epoch": 0.154, + "grad_norm": 0.84375, + "grad_norm_var": 0.12790629069010417, + "learning_rate": 2e-05, + "loss": 0.0645, + "loss/crossentropy": 2.5025904178619385, + "loss/hidden": 0.0584716796875, + "loss/logits": 0.005998906912282109, + "step": 154 + }, + { + "epoch": 0.155, + "grad_norm": 1.75, + "grad_norm_var": 0.14317194620768228, + "learning_rate": 2e-05, + "loss": 0.0673, + "loss/crossentropy": 1.7674061059951782, + "loss/hidden": 0.0618896484375, + "loss/logits": 0.005377188790589571, + "step": 155 + }, + { + "epoch": 0.156, + "grad_norm": 1.046875, + "grad_norm_var": 0.14455540974934897, + "learning_rate": 2e-05, + "loss": 0.0696, + "loss/crossentropy": 1.4891575574874878, + "loss/hidden": 0.0640869140625, + "loss/logits": 0.005491052754223347, + "step": 156 + }, + { + "epoch": 0.157, + "grad_norm": 1.0078125, + "grad_norm_var": 0.1416147232055664, + "learning_rate": 2e-05, + "loss": 0.0656, + "loss/crossentropy": 1.4295508861541748, + "loss/hidden": 0.060546875, + "loss/logits": 0.005026416387408972, + "step": 157 + }, + { + "epoch": 0.158, + "grad_norm": 8.5, + "grad_norm_var": 3.4551263809204102, + "learning_rate": 2e-05, + "loss": 0.1047, + "loss/crossentropy": 1.6207728683948517, + "loss/hidden": 0.09716796875, + "loss/logits": 0.007503823610022664, + "step": 158 + }, + { + "epoch": 0.159, + "grad_norm": 1.3125, + "grad_norm_var": 3.4331842422485352, + "learning_rate": 2e-05, + "loss": 0.0663, + "loss/crossentropy": 1.838720440864563, + "loss/hidden": 0.06103515625, + "loss/logits": 0.0052408319897949696, + "step": 159 + }, + { + "epoch": 0.16, + "grad_norm": 1.765625, + "grad_norm_var": 3.403587277730306, + "learning_rate": 2e-05, + "loss": 0.0729, + "loss/crossentropy": 1.9572261571884155, + "loss/hidden": 0.06640625, + "loss/logits": 0.00649917172268033, + "step": 160 + }, + { + "epoch": 0.161, + "grad_norm": 7.71875, + "grad_norm_var": 5.5313720067342125, + "learning_rate": 2e-05, + "loss": 0.0873, + "loss/crossentropy": 0.06751747522503138, + "loss/hidden": 0.086181640625, + "loss/logits": 0.001096382096875459, + "step": 161 + }, + { + "epoch": 0.162, + "grad_norm": 1.65625, + "grad_norm_var": 5.535835202534994, + "learning_rate": 2e-05, + "loss": 0.0753, + "loss/crossentropy": 1.9767259359359741, + "loss/hidden": 0.06884765625, + "loss/logits": 0.006433435715734959, + "step": 162 + }, + { + "epoch": 0.163, + "grad_norm": 1.2734375, + "grad_norm_var": 5.470252927144369, + "learning_rate": 2e-05, + "loss": 0.0742, + "loss/crossentropy": 1.6337787508964539, + "loss/hidden": 0.068359375, + "loss/logits": 0.0058679585345089436, + "step": 163 + }, + { + "epoch": 0.164, + "grad_norm": 1.171875, + "grad_norm_var": 5.519557634989421, + "learning_rate": 2e-05, + "loss": 0.0791, + "loss/crossentropy": 1.5085630416870117, + "loss/hidden": 0.0732421875, + "loss/logits": 0.00587455416098237, + "step": 164 + }, + { + "epoch": 0.165, + "grad_norm": 1.328125, + "grad_norm_var": 5.454612668355306, + "learning_rate": 2e-05, + "loss": 0.0733, + "loss/crossentropy": 2.1295101046562195, + "loss/hidden": 0.0665283203125, + "loss/logits": 0.006821601651608944, + "step": 165 + }, + { + "epoch": 0.166, + "grad_norm": 0.828125, + "grad_norm_var": 5.523303159077963, + "learning_rate": 2e-05, + "loss": 0.0681, + "loss/crossentropy": 2.1514192819595337, + "loss/hidden": 0.061767578125, + "loss/logits": 0.0063285790383815765, + "step": 166 + }, + { + "epoch": 0.167, + "grad_norm": 0.9140625, + "grad_norm_var": 5.522652180989583, + "learning_rate": 2e-05, + "loss": 0.0799, + "loss/crossentropy": 1.907168447971344, + "loss/hidden": 0.072509765625, + "loss/logits": 0.0073654367588460445, + "step": 167 + }, + { + "epoch": 0.168, + "grad_norm": 0.70703125, + "grad_norm_var": 5.650849850972493, + "learning_rate": 2e-05, + "loss": 0.0665, + "loss/crossentropy": 2.490573525428772, + "loss/hidden": 0.0604248046875, + "loss/logits": 0.006123463856056333, + "step": 168 + }, + { + "epoch": 0.169, + "grad_norm": 0.921875, + "grad_norm_var": 5.727275530497233, + "learning_rate": 2e-05, + "loss": 0.0686, + "loss/crossentropy": 2.1971182823181152, + "loss/hidden": 0.0625, + "loss/logits": 0.006081034895032644, + "step": 169 + }, + { + "epoch": 0.17, + "grad_norm": 0.84375, + "grad_norm_var": 5.727275530497233, + "learning_rate": 2e-05, + "loss": 0.0723, + "loss/crossentropy": 1.9449633955955505, + "loss/hidden": 0.06591796875, + "loss/logits": 0.00633727153763175, + "step": 170 + }, + { + "epoch": 0.171, + "grad_norm": 0.80078125, + "grad_norm_var": 5.8211313883463545, + "learning_rate": 2e-05, + "loss": 0.0721, + "loss/crossentropy": 1.8933625221252441, + "loss/hidden": 0.066162109375, + "loss/logits": 0.005927694728597999, + "step": 171 + }, + { + "epoch": 0.172, + "grad_norm": 0.734375, + "grad_norm_var": 5.8664194742838545, + "learning_rate": 2e-05, + "loss": 0.0756, + "loss/crossentropy": 2.2961581349372864, + "loss/hidden": 0.069091796875, + "loss/logits": 0.00650426116771996, + "step": 172 + }, + { + "epoch": 0.173, + "grad_norm": 1.0859375, + "grad_norm_var": 5.856801350911458, + "learning_rate": 2e-05, + "loss": 0.0876, + "loss/crossentropy": 1.5580723285675049, + "loss/hidden": 0.080322265625, + "loss/logits": 0.00728521216660738, + "step": 173 + }, + { + "epoch": 0.174, + "grad_norm": 0.87109375, + "grad_norm_var": 2.8547820409138995, + "learning_rate": 2e-05, + "loss": 0.0785, + "loss/crossentropy": 2.4996918439865112, + "loss/hidden": 0.07080078125, + "loss/logits": 0.0076872315257787704, + "step": 174 + }, + { + "epoch": 0.175, + "grad_norm": 1.09375, + "grad_norm_var": 2.863120460510254, + "learning_rate": 2e-05, + "loss": 0.0842, + "loss/crossentropy": 2.341306686401367, + "loss/hidden": 0.075927734375, + "loss/logits": 0.008260179311037064, + "step": 175 + }, + { + "epoch": 0.176, + "grad_norm": 1.2734375, + "grad_norm_var": 2.859659767150879, + "learning_rate": 2e-05, + "loss": 0.0839, + "loss/crossentropy": 2.0976521968841553, + "loss/hidden": 0.075927734375, + "loss/logits": 0.007956868037581444, + "step": 176 + }, + { + "epoch": 0.177, + "grad_norm": 1.6640625, + "grad_norm_var": 0.09129581451416016, + "learning_rate": 2e-05, + "loss": 0.0854, + "loss/crossentropy": 1.5655289888381958, + "loss/hidden": 0.078857421875, + "loss/logits": 0.006505638128146529, + "step": 177 + }, + { + "epoch": 0.178, + "grad_norm": 0.96484375, + "grad_norm_var": 0.06740493774414062, + "learning_rate": 2e-05, + "loss": 0.0832, + "loss/crossentropy": 1.947506844997406, + "loss/hidden": 0.076171875, + "loss/logits": 0.0070168147794902325, + "step": 178 + }, + { + "epoch": 0.179, + "grad_norm": 4.5625, + "grad_norm_var": 0.8503774007161459, + "learning_rate": 2e-05, + "loss": 0.0965, + "loss/crossentropy": 1.557403326034546, + "loss/hidden": 0.087158203125, + "loss/logits": 0.009354921989142895, + "step": 179 + }, + { + "epoch": 0.18, + "grad_norm": 8.3125, + "grad_norm_var": 3.9767252604166665, + "learning_rate": 2e-05, + "loss": 0.1122, + "loss/crossentropy": 0.45333431661129, + "loss/hidden": 0.109375, + "loss/logits": 0.0027967533096671104, + "step": 180 + }, + { + "epoch": 0.181, + "grad_norm": 1.546875, + "grad_norm_var": 3.969405110677083, + "learning_rate": 2e-05, + "loss": 0.0829, + "loss/crossentropy": 2.005882978439331, + "loss/hidden": 0.075439453125, + "loss/logits": 0.007453362224623561, + "step": 181 + }, + { + "epoch": 0.182, + "grad_norm": 1.3515625, + "grad_norm_var": 3.926006825764974, + "learning_rate": 2e-05, + "loss": 0.0849, + "loss/crossentropy": 2.199571132659912, + "loss/hidden": 0.077880859375, + "loss/logits": 0.0069826748222112656, + "step": 182 + }, + { + "epoch": 0.183, + "grad_norm": 1.5703125, + "grad_norm_var": 3.8817014058430988, + "learning_rate": 2e-05, + "loss": 0.0921, + "loss/crossentropy": 1.6926537156105042, + "loss/hidden": 0.085205078125, + "loss/logits": 0.006879956694319844, + "step": 183 + }, + { + "epoch": 0.184, + "grad_norm": 1.203125, + "grad_norm_var": 3.826835568745931, + "learning_rate": 2e-05, + "loss": 0.0964, + "loss/crossentropy": 1.509221613407135, + "loss/hidden": 0.087890625, + "loss/logits": 0.00847849901765585, + "step": 184 + }, + { + "epoch": 0.185, + "grad_norm": 0.703125, + "grad_norm_var": 3.8554396947224934, + "learning_rate": 2e-05, + "loss": 0.0788, + "loss/crossentropy": 2.4337867498397827, + "loss/hidden": 0.072021484375, + "loss/logits": 0.0067423065192997456, + "step": 185 + }, + { + "epoch": 0.186, + "grad_norm": 1.234375, + "grad_norm_var": 3.815881284077962, + "learning_rate": 2e-05, + "loss": 0.0966, + "loss/crossentropy": 1.7458332180976868, + "loss/hidden": 0.08837890625, + "loss/logits": 0.008262162329629064, + "step": 186 + }, + { + "epoch": 0.187, + "grad_norm": 6.59375, + "grad_norm_var": 5.133159383138021, + "learning_rate": 2e-05, + "loss": 0.0928, + "loss/crossentropy": 2.116236627101898, + "loss/hidden": 0.0830078125, + "loss/logits": 0.00975541677325964, + "step": 187 + }, + { + "epoch": 0.188, + "grad_norm": 1.8203125, + "grad_norm_var": 4.998583730061849, + "learning_rate": 2e-05, + "loss": 0.0831, + "loss/crossentropy": 2.324514389038086, + "loss/hidden": 0.075439453125, + "loss/logits": 0.007644579978659749, + "step": 188 + }, + { + "epoch": 0.189, + "grad_norm": 0.796875, + "grad_norm_var": 5.048313395182292, + "learning_rate": 2e-05, + "loss": 0.0867, + "loss/crossentropy": 1.9479625821113586, + "loss/hidden": 0.0791015625, + "loss/logits": 0.0075566458981484175, + "step": 189 + }, + { + "epoch": 0.19, + "grad_norm": 15.875, + "grad_norm_var": 16.414309628804524, + "learning_rate": 2e-05, + "loss": 0.1592, + "loss/crossentropy": 1.5863521695137024, + "loss/hidden": 0.1494140625, + "loss/logits": 0.009787225630134344, + "step": 190 + }, + { + "epoch": 0.191, + "grad_norm": 2.046875, + "grad_norm_var": 16.208450762430825, + "learning_rate": 2e-05, + "loss": 0.0784, + "loss/crossentropy": 0.8779918029904366, + "loss/hidden": 0.073974609375, + "loss/logits": 0.004391094436869025, + "step": 191 + }, + { + "epoch": 0.192, + "grad_norm": 1.375, + "grad_norm_var": 16.1827361424764, + "learning_rate": 2e-05, + "loss": 0.0931, + "loss/crossentropy": 2.1567060947418213, + "loss/hidden": 0.085693359375, + "loss/logits": 0.007449513301253319, + "step": 192 + }, + { + "epoch": 0.193, + "grad_norm": 0.875, + "grad_norm_var": 16.386012204488118, + "learning_rate": 2e-05, + "loss": 0.0898, + "loss/crossentropy": 1.8178179860115051, + "loss/hidden": 0.08251953125, + "loss/logits": 0.007294924231246114, + "step": 193 + }, + { + "epoch": 0.194, + "grad_norm": 2.21875, + "grad_norm_var": 16.114434560139973, + "learning_rate": 2e-05, + "loss": 0.1014, + "loss/crossentropy": 1.8806178569793701, + "loss/hidden": 0.09375, + "loss/logits": 0.0076924534514546394, + "step": 194 + }, + { + "epoch": 0.195, + "grad_norm": 1.8671875, + "grad_norm_var": 16.098729451497395, + "learning_rate": 2e-05, + "loss": 0.1048, + "loss/crossentropy": 1.6054936051368713, + "loss/hidden": 0.096435546875, + "loss/logits": 0.008354771416634321, + "step": 195 + }, + { + "epoch": 0.196, + "grad_norm": 1.90625, + "grad_norm_var": 14.200210571289062, + "learning_rate": 2e-05, + "loss": 0.0851, + "loss/crossentropy": 1.1937458366155624, + "loss/hidden": 0.079833984375, + "loss/logits": 0.005313969450071454, + "step": 196 + }, + { + "epoch": 0.197, + "grad_norm": 2.453125, + "grad_norm_var": 14.113833618164062, + "learning_rate": 2e-05, + "loss": 0.1056, + "loss/crossentropy": 1.9973903894424438, + "loss/hidden": 0.09619140625, + "loss/logits": 0.00938287889584899, + "step": 197 + }, + { + "epoch": 0.198, + "grad_norm": 1.5546875, + "grad_norm_var": 14.07872314453125, + "learning_rate": 2e-05, + "loss": 0.087, + "loss/crossentropy": 2.0422087907791138, + "loss/hidden": 0.07958984375, + "loss/logits": 0.007449948927387595, + "step": 198 + }, + { + "epoch": 0.199, + "grad_norm": 0.875, + "grad_norm_var": 14.218849436442058, + "learning_rate": 2e-05, + "loss": 0.0908, + "loss/crossentropy": 2.040232002735138, + "loss/hidden": 0.08349609375, + "loss/logits": 0.007334771566092968, + "step": 199 + }, + { + "epoch": 0.2, + "grad_norm": 3.6875, + "grad_norm_var": 14.104658762613932, + "learning_rate": 2e-05, + "loss": 0.0996, + "loss/crossentropy": 1.7977141737937927, + "loss/hidden": 0.09130859375, + "loss/logits": 0.008285259362310171, + "step": 200 + }, + { + "epoch": 0.201, + "grad_norm": 1.1640625, + "grad_norm_var": 13.984908040364584, + "learning_rate": 2e-05, + "loss": 0.0923, + "loss/crossentropy": 1.960830569267273, + "loss/hidden": 0.0849609375, + "loss/logits": 0.007373227505013347, + "step": 201 + }, + { + "epoch": 0.202, + "grad_norm": 1.2109375, + "grad_norm_var": 13.99013646443685, + "learning_rate": 2e-05, + "loss": 0.1063, + "loss/crossentropy": 1.5903997421264648, + "loss/hidden": 0.098876953125, + "loss/logits": 0.007376475026831031, + "step": 202 + }, + { + "epoch": 0.203, + "grad_norm": 2.015625, + "grad_norm_var": 13.0423215230306, + "learning_rate": 2e-05, + "loss": 0.0958, + "loss/crossentropy": 1.1866007596254349, + "loss/hidden": 0.0908203125, + "loss/logits": 0.0049855056568048894, + "step": 203 + }, + { + "epoch": 0.204, + "grad_norm": 2.203125, + "grad_norm_var": 13.01123046875, + "learning_rate": 2e-05, + "loss": 0.1001, + "loss/crossentropy": 2.016387164592743, + "loss/hidden": 0.092529296875, + "loss/logits": 0.0076178074814379215, + "step": 204 + }, + { + "epoch": 0.205, + "grad_norm": 0.98828125, + "grad_norm_var": 12.966665585835775, + "learning_rate": 2e-05, + "loss": 0.1017, + "loss/crossentropy": 1.9937080144882202, + "loss/hidden": 0.09326171875, + "loss/logits": 0.008388462010771036, + "step": 205 + }, + { + "epoch": 0.206, + "grad_norm": 1.65625, + "grad_norm_var": 0.5201679865519205, + "learning_rate": 2e-05, + "loss": 0.1012, + "loss/crossentropy": 1.8353246450424194, + "loss/hidden": 0.09326171875, + "loss/logits": 0.00795629364438355, + "step": 206 + }, + { + "epoch": 0.207, + "grad_norm": 1.6875, + "grad_norm_var": 0.5143070856730143, + "learning_rate": 2e-05, + "loss": 0.0918, + "loss/crossentropy": 1.0499791204929352, + "loss/hidden": 0.08740234375, + "loss/logits": 0.004438678151927888, + "step": 207 + }, + { + "epoch": 0.208, + "grad_norm": 1.0625, + "grad_norm_var": 0.5353540420532227, + "learning_rate": 2e-05, + "loss": 0.107, + "loss/crossentropy": 1.8614663481712341, + "loss/hidden": 0.09814453125, + "loss/logits": 0.008855776861310005, + "step": 208 + }, + { + "epoch": 0.209, + "grad_norm": 2.390625, + "grad_norm_var": 0.5093535741170248, + "learning_rate": 2e-05, + "loss": 0.1072, + "loss/crossentropy": 2.363565683364868, + "loss/hidden": 0.096923828125, + "loss/logits": 0.010271006729453802, + "step": 209 + }, + { + "epoch": 0.21, + "grad_norm": 2.171875, + "grad_norm_var": 0.5069289525349935, + "learning_rate": 2e-05, + "loss": 0.1086, + "loss/crossentropy": 1.955030083656311, + "loss/hidden": 0.099365234375, + "loss/logits": 0.0092296302318573, + "step": 210 + }, + { + "epoch": 0.211, + "grad_norm": 1.2265625, + "grad_norm_var": 0.5273447036743164, + "learning_rate": 2e-05, + "loss": 0.1062, + "loss/crossentropy": 1.774095892906189, + "loss/hidden": 0.0986328125, + "loss/logits": 0.007574398070573807, + "step": 211 + }, + { + "epoch": 0.212, + "grad_norm": 1.2890625, + "grad_norm_var": 0.5396000544230143, + "learning_rate": 2e-05, + "loss": 0.1117, + "loss/crossentropy": 1.8405153155326843, + "loss/hidden": 0.10302734375, + "loss/logits": 0.008719130419194698, + "step": 212 + }, + { + "epoch": 0.213, + "grad_norm": 1.40625, + "grad_norm_var": 0.5067829767862956, + "learning_rate": 2e-05, + "loss": 0.1045, + "loss/crossentropy": 2.0069875717163086, + "loss/hidden": 0.095947265625, + "loss/logits": 0.008583055343478918, + "step": 213 + }, + { + "epoch": 0.214, + "grad_norm": 1.1640625, + "grad_norm_var": 0.5219018936157227, + "learning_rate": 2e-05, + "loss": 0.1103, + "loss/crossentropy": 1.670526921749115, + "loss/hidden": 0.102294921875, + "loss/logits": 0.008038338739424944, + "step": 214 + }, + { + "epoch": 0.215, + "grad_norm": 1.8828125, + "grad_norm_var": 0.48292789459228513, + "learning_rate": 2e-05, + "loss": 0.1121, + "loss/crossentropy": 1.795514464378357, + "loss/hidden": 0.103759765625, + "loss/logits": 0.008318986743688583, + "step": 215 + }, + { + "epoch": 0.216, + "grad_norm": 1.1328125, + "grad_norm_var": 0.2139871597290039, + "learning_rate": 2e-05, + "loss": 0.1066, + "loss/crossentropy": 2.180332064628601, + "loss/hidden": 0.09716796875, + "loss/logits": 0.009391986764967442, + "step": 216 + }, + { + "epoch": 0.217, + "grad_norm": 1.9375, + "grad_norm_var": 0.21252689361572266, + "learning_rate": 2e-05, + "loss": 0.1234, + "loss/crossentropy": 1.8504464030265808, + "loss/hidden": 0.11181640625, + "loss/logits": 0.011583337560296059, + "step": 217 + }, + { + "epoch": 0.218, + "grad_norm": 1.046875, + "grad_norm_var": 0.22248172760009766, + "learning_rate": 2e-05, + "loss": 0.1098, + "loss/crossentropy": 1.6542016863822937, + "loss/hidden": 0.101806640625, + "loss/logits": 0.007953221211209893, + "step": 218 + }, + { + "epoch": 0.219, + "grad_norm": 1.1484375, + "grad_norm_var": 0.21898136138916016, + "learning_rate": 2e-05, + "loss": 0.1185, + "loss/crossentropy": 1.8401342630386353, + "loss/hidden": 0.107421875, + "loss/logits": 0.011056106071919203, + "step": 219 + }, + { + "epoch": 0.22, + "grad_norm": 1.2578125, + "grad_norm_var": 0.18931725819905598, + "learning_rate": 2e-05, + "loss": 0.1082, + "loss/crossentropy": 1.8265935778617859, + "loss/hidden": 0.09912109375, + "loss/logits": 0.009068313986063004, + "step": 220 + }, + { + "epoch": 0.221, + "grad_norm": 52.25, + "grad_norm_var": 161.16229426066081, + "learning_rate": 2e-05, + "loss": 0.1937, + "loss/crossentropy": 1.5437742471694946, + "loss/hidden": 0.170654296875, + "loss/logits": 0.023064299020916224, + "step": 221 + }, + { + "epoch": 0.222, + "grad_norm": 2.28125, + "grad_norm_var": 160.93560969034831, + "learning_rate": 2e-05, + "loss": 0.1246, + "loss/crossentropy": 1.227450430393219, + "loss/hidden": 0.11572265625, + "loss/logits": 0.008849140722304583, + "step": 222 + }, + { + "epoch": 0.223, + "grad_norm": 1.28125, + "grad_norm_var": 161.10956192016602, + "learning_rate": 2e-05, + "loss": 0.1196, + "loss/crossentropy": 1.9892451167106628, + "loss/hidden": 0.1103515625, + "loss/logits": 0.009212612174451351, + "step": 223 + }, + { + "epoch": 0.224, + "grad_norm": 1.0625, + "grad_norm_var": 161.10956192016602, + "learning_rate": 2e-05, + "loss": 0.1208, + "loss/crossentropy": 1.9727575778961182, + "loss/hidden": 0.111328125, + "loss/logits": 0.009519532322883606, + "step": 224 + }, + { + "epoch": 0.225, + "grad_norm": 1.9140625, + "grad_norm_var": 161.26942443847656, + "learning_rate": 2e-05, + "loss": 0.1112, + "loss/crossentropy": 2.20854651927948, + "loss/hidden": 0.1025390625, + "loss/logits": 0.008704130537807941, + "step": 225 + }, + { + "epoch": 0.226, + "grad_norm": 1.703125, + "grad_norm_var": 161.43824768066406, + "learning_rate": 2e-05, + "loss": 0.1249, + "loss/crossentropy": 1.8244708180427551, + "loss/hidden": 0.115478515625, + "loss/logits": 0.009438233450055122, + "step": 226 + }, + { + "epoch": 0.227, + "grad_norm": 1.9921875, + "grad_norm_var": 161.12805989583333, + "learning_rate": 2e-05, + "loss": 0.1264, + "loss/crossentropy": 1.6184683442115784, + "loss/hidden": 0.117431640625, + "loss/logits": 0.008998575620353222, + "step": 227 + }, + { + "epoch": 0.228, + "grad_norm": 1.40625, + "grad_norm_var": 161.0760617574056, + "learning_rate": 2e-05, + "loss": 0.1427, + "loss/crossentropy": 1.9090940952301025, + "loss/hidden": 0.1298828125, + "loss/logits": 0.01286676386371255, + "step": 228 + }, + { + "epoch": 0.229, + "grad_norm": 1.5078125, + "grad_norm_var": 161.03238525390626, + "learning_rate": 2e-05, + "loss": 0.1191, + "loss/crossentropy": 1.7622392773628235, + "loss/hidden": 0.109619140625, + "loss/logits": 0.009484861977398396, + "step": 229 + }, + { + "epoch": 0.23, + "grad_norm": 1.3671875, + "grad_norm_var": 160.93959045410156, + "learning_rate": 2e-05, + "loss": 0.1185, + "loss/crossentropy": 1.7633178234100342, + "loss/hidden": 0.109130859375, + "loss/logits": 0.009330280125141144, + "step": 230 + }, + { + "epoch": 0.231, + "grad_norm": 0.98828125, + "grad_norm_var": 161.32540073394776, + "learning_rate": 2e-05, + "loss": 0.1188, + "loss/crossentropy": 2.186140298843384, + "loss/hidden": 0.108154296875, + "loss/logits": 0.010631876531988382, + "step": 231 + }, + { + "epoch": 0.232, + "grad_norm": 3.28125, + "grad_norm_var": 160.60855553944904, + "learning_rate": 2e-05, + "loss": 0.1224, + "loss/crossentropy": 0.8389374911785126, + "loss/hidden": 0.1171875, + "loss/logits": 0.005214276316110045, + "step": 232 + }, + { + "epoch": 0.233, + "grad_norm": 1.0703125, + "grad_norm_var": 160.98382867177327, + "learning_rate": 2e-05, + "loss": 0.116, + "loss/crossentropy": 2.1515474915504456, + "loss/hidden": 0.107421875, + "loss/logits": 0.00860951654613018, + "step": 233 + }, + { + "epoch": 0.234, + "grad_norm": 4.5, + "grad_norm_var": 160.03680464426677, + "learning_rate": 2e-05, + "loss": 0.1312, + "loss/crossentropy": 1.6820667684078217, + "loss/hidden": 0.123046875, + "loss/logits": 0.008124232292175293, + "step": 234 + }, + { + "epoch": 0.235, + "grad_norm": 2.40625, + "grad_norm_var": 159.50010522206625, + "learning_rate": 2e-05, + "loss": 0.1056, + "loss/crossentropy": 0.9079534839838743, + "loss/hidden": 0.10107421875, + "loss/logits": 0.004542189242783934, + "step": 235 + }, + { + "epoch": 0.236, + "grad_norm": 0.984375, + "grad_norm_var": 159.64182631174722, + "learning_rate": 2e-05, + "loss": 0.1192, + "loss/crossentropy": 2.261181592941284, + "loss/hidden": 0.109619140625, + "loss/logits": 0.009581252932548523, + "step": 236 + }, + { + "epoch": 0.237, + "grad_norm": 0.9921875, + "grad_norm_var": 0.9261479059855143, + "learning_rate": 2e-05, + "loss": 0.1281, + "loss/crossentropy": 1.9553669095039368, + "loss/hidden": 0.116943359375, + "loss/logits": 0.011152476072311401, + "step": 237 + }, + { + "epoch": 0.238, + "grad_norm": 1.640625, + "grad_norm_var": 0.9103616714477539, + "learning_rate": 2e-05, + "loss": 0.1466, + "loss/crossentropy": 1.6360890865325928, + "loss/hidden": 0.13525390625, + "loss/logits": 0.011308418586850166, + "step": 238 + }, + { + "epoch": 0.239, + "grad_norm": 2.265625, + "grad_norm_var": 0.9085992813110352, + "learning_rate": 2e-05, + "loss": 0.133, + "loss/crossentropy": 1.0788212679326534, + "loss/hidden": 0.125732421875, + "loss/logits": 0.007256039883941412, + "step": 239 + }, + { + "epoch": 0.24, + "grad_norm": 1.7578125, + "grad_norm_var": 0.8688089370727539, + "learning_rate": 2e-05, + "loss": 0.1296, + "loss/crossentropy": 1.6809419393539429, + "loss/hidden": 0.119873046875, + "loss/logits": 0.009761545807123184, + "step": 240 + }, + { + "epoch": 0.241, + "grad_norm": 1.4921875, + "grad_norm_var": 0.8769525527954102, + "learning_rate": 2e-05, + "loss": 0.1298, + "loss/crossentropy": 2.1073160767555237, + "loss/hidden": 0.1201171875, + "loss/logits": 0.009713000617921352, + "step": 241 + }, + { + "epoch": 0.242, + "grad_norm": 3.3125, + "grad_norm_var": 1.0105956395467122, + "learning_rate": 2e-05, + "loss": 0.1851, + "loss/crossentropy": 1.7140259146690369, + "loss/hidden": 0.168212890625, + "loss/logits": 0.01692299358546734, + "step": 242 + }, + { + "epoch": 0.243, + "grad_norm": 1.3203125, + "grad_norm_var": 1.0337132136027019, + "learning_rate": 2e-05, + "loss": 0.141, + "loss/crossentropy": 1.70401269197464, + "loss/hidden": 0.13037109375, + "loss/logits": 0.010653213132172823, + "step": 243 + }, + { + "epoch": 0.244, + "grad_norm": 2.015625, + "grad_norm_var": 1.0173481623331706, + "learning_rate": 2e-05, + "loss": 0.1561, + "loss/crossentropy": 1.9086145758628845, + "loss/hidden": 0.1416015625, + "loss/logits": 0.01448416942730546, + "step": 244 + }, + { + "epoch": 0.245, + "grad_norm": 1.890625, + "grad_norm_var": 1.0048868179321289, + "learning_rate": 2e-05, + "loss": 0.1751, + "loss/crossentropy": 1.5015806555747986, + "loss/hidden": 0.16064453125, + "loss/logits": 0.014442750252783298, + "step": 245 + }, + { + "epoch": 0.246, + "grad_norm": 1.6796875, + "grad_norm_var": 0.9864847183227539, + "learning_rate": 2e-05, + "loss": 0.1323, + "loss/crossentropy": 1.9546470642089844, + "loss/hidden": 0.12255859375, + "loss/logits": 0.009766705334186554, + "step": 246 + }, + { + "epoch": 0.247, + "grad_norm": 1.203125, + "grad_norm_var": 0.9611083984375, + "learning_rate": 2e-05, + "loss": 0.1539, + "loss/crossentropy": 1.7062721848487854, + "loss/hidden": 0.1416015625, + "loss/logits": 0.01230617519468069, + "step": 247 + }, + { + "epoch": 0.248, + "grad_norm": 4.21875, + "grad_norm_var": 1.1776611328125, + "learning_rate": 2e-05, + "loss": 0.1515, + "loss/crossentropy": 1.740279734134674, + "loss/hidden": 0.14013671875, + "loss/logits": 0.011402689386159182, + "step": 248 + }, + { + "epoch": 0.249, + "grad_norm": 2.3125, + "grad_norm_var": 1.1123573303222656, + "learning_rate": 2e-05, + "loss": 0.1504, + "loss/crossentropy": 1.640882670879364, + "loss/hidden": 0.1396484375, + "loss/logits": 0.01071554934605956, + "step": 249 + }, + { + "epoch": 0.25, + "grad_norm": 2.796875, + "grad_norm_var": 0.7542132059733073, + "learning_rate": 2e-05, + "loss": 0.1364, + "loss/crossentropy": 1.4670004844665527, + "loss/hidden": 0.126708984375, + "loss/logits": 0.0096431621350348, + "step": 250 + }, + { + "epoch": 0.251, + "grad_norm": 1.1796875, + "grad_norm_var": 0.7847574869791667, + "learning_rate": 2e-05, + "loss": 0.14, + "loss/crossentropy": 2.2024736404418945, + "loss/hidden": 0.127197265625, + "loss/logits": 0.012759591452777386, + "step": 251 + }, + { + "epoch": 0.252, + "grad_norm": 3.53125, + "grad_norm_var": 0.8651763916015625, + "learning_rate": 2e-05, + "loss": 0.1539, + "loss/crossentropy": 2.0269722938537598, + "loss/hidden": 0.14208984375, + "loss/logits": 0.011817097198218107, + "step": 252 + }, + { + "epoch": 0.253, + "grad_norm": 9.375, + "grad_norm_var": 4.018281809488932, + "learning_rate": 2e-05, + "loss": 0.1661, + "loss/crossentropy": 0.34899202920496464, + "loss/hidden": 0.163818359375, + "loss/logits": 0.0022718849941156805, + "step": 253 + }, + { + "epoch": 0.254, + "grad_norm": 1.9921875, + "grad_norm_var": 3.9798868815104167, + "learning_rate": 2e-05, + "loss": 0.1441, + "loss/crossentropy": 2.2475985288619995, + "loss/hidden": 0.1318359375, + "loss/logits": 0.012224531266838312, + "step": 254 + }, + { + "epoch": 0.255, + "grad_norm": 1.6328125, + "grad_norm_var": 4.037050120035807, + "learning_rate": 2e-05, + "loss": 0.1497, + "loss/crossentropy": 2.8270416259765625, + "loss/hidden": 0.13623046875, + "loss/logits": 0.013480226043611765, + "step": 255 + }, + { + "epoch": 0.256, + "grad_norm": 1.4609375, + "grad_norm_var": 4.07616958618164, + "learning_rate": 2e-05, + "loss": 0.1668, + "loss/crossentropy": 1.3126854300498962, + "loss/hidden": 0.15576171875, + "loss/logits": 0.01107651786878705, + "step": 256 + }, + { + "epoch": 0.257, + "grad_norm": 1.9140625, + "grad_norm_var": 4.02563247680664, + "learning_rate": 2e-05, + "loss": 0.1502, + "loss/crossentropy": 1.4198355078697205, + "loss/hidden": 0.1396484375, + "loss/logits": 0.01056258101016283, + "step": 257 + }, + { + "epoch": 0.258, + "grad_norm": 1.3671875, + "grad_norm_var": 4.081167602539063, + "learning_rate": 2e-05, + "loss": 0.1421, + "loss/crossentropy": 1.657827377319336, + "loss/hidden": 0.13232421875, + "loss/logits": 0.009755304548889399, + "step": 258 + }, + { + "epoch": 0.259, + "grad_norm": 1.75, + "grad_norm_var": 4.025512440999349, + "learning_rate": 2e-05, + "loss": 0.1352, + "loss/crossentropy": 2.3775731325149536, + "loss/hidden": 0.12548828125, + "loss/logits": 0.0096644451841712, + "step": 259 + }, + { + "epoch": 0.26, + "grad_norm": 1.40625, + "grad_norm_var": 4.089703114827474, + "learning_rate": 2e-05, + "loss": 0.1442, + "loss/crossentropy": 2.2461366653442383, + "loss/hidden": 0.13232421875, + "loss/logits": 0.011895926669239998, + "step": 260 + }, + { + "epoch": 0.261, + "grad_norm": 2.578125, + "grad_norm_var": 4.065040842692057, + "learning_rate": 2e-05, + "loss": 0.1474, + "loss/crossentropy": 1.560776025056839, + "loss/hidden": 0.1337890625, + "loss/logits": 0.013578795362263918, + "step": 261 + }, + { + "epoch": 0.262, + "grad_norm": 1.5390625, + "grad_norm_var": 4.082124582926432, + "learning_rate": 2e-05, + "loss": 0.1556, + "loss/crossentropy": 1.9976117014884949, + "loss/hidden": 0.14404296875, + "loss/logits": 0.011512083932757378, + "step": 262 + }, + { + "epoch": 0.263, + "grad_norm": 1.6328125, + "grad_norm_var": 4.018440755208333, + "learning_rate": 2e-05, + "loss": 0.1759, + "loss/crossentropy": 1.705672264099121, + "loss/hidden": 0.16162109375, + "loss/logits": 0.014301342889666557, + "step": 263 + }, + { + "epoch": 0.264, + "grad_norm": 1.765625, + "grad_norm_var": 3.8464345296223956, + "learning_rate": 2e-05, + "loss": 0.1864, + "loss/crossentropy": 1.7075408101081848, + "loss/hidden": 0.171875, + "loss/logits": 0.01456779520958662, + "step": 264 + }, + { + "epoch": 0.265, + "grad_norm": 1.859375, + "grad_norm_var": 3.86392822265625, + "learning_rate": 2e-05, + "loss": 0.1677, + "loss/crossentropy": 2.094871759414673, + "loss/hidden": 0.15380859375, + "loss/logits": 0.013906504027545452, + "step": 265 + }, + { + "epoch": 0.266, + "grad_norm": 2.578125, + "grad_norm_var": 3.8542154947916667, + "learning_rate": 2e-05, + "loss": 0.1591, + "loss/crossentropy": 2.166890859603882, + "loss/hidden": 0.146484375, + "loss/logits": 0.012606294360011816, + "step": 266 + }, + { + "epoch": 0.267, + "grad_norm": 3.859375, + "grad_norm_var": 3.885705312093099, + "learning_rate": 2e-05, + "loss": 0.1763, + "loss/crossentropy": 1.674479365348816, + "loss/hidden": 0.162109375, + "loss/logits": 0.01416744152083993, + "step": 267 + }, + { + "epoch": 0.268, + "grad_norm": 2.625, + "grad_norm_var": 3.8142555236816404, + "learning_rate": 2e-05, + "loss": 0.2022, + "loss/crossentropy": 1.0146620571613312, + "loss/hidden": 0.1904296875, + "loss/logits": 0.01172702293843031, + "step": 268 + }, + { + "epoch": 0.269, + "grad_norm": 1.21875, + "grad_norm_var": 0.4503334045410156, + "learning_rate": 2e-05, + "loss": 0.1457, + "loss/crossentropy": 1.8024365305900574, + "loss/hidden": 0.13427734375, + "loss/logits": 0.011465264018625021, + "step": 269 + }, + { + "epoch": 0.27, + "grad_norm": 1.4296875, + "grad_norm_var": 0.46684951782226564, + "learning_rate": 2e-05, + "loss": 0.161, + "loss/crossentropy": 1.7421787977218628, + "loss/hidden": 0.14892578125, + "loss/logits": 0.012049074750393629, + "step": 270 + }, + { + "epoch": 0.271, + "grad_norm": 2.21875, + "grad_norm_var": 0.4663726806640625, + "learning_rate": 2e-05, + "loss": 0.1519, + "loss/crossentropy": 1.1601504981517792, + "loss/hidden": 0.14404296875, + "loss/logits": 0.007814974524080753, + "step": 271 + }, + { + "epoch": 0.272, + "grad_norm": 1.7421875, + "grad_norm_var": 0.4529693603515625, + "learning_rate": 2e-05, + "loss": 0.1693, + "loss/crossentropy": 1.9806629419326782, + "loss/hidden": 0.15625, + "loss/logits": 0.01302909990772605, + "step": 272 + }, + { + "epoch": 0.273, + "grad_norm": 1.1796875, + "grad_norm_var": 0.4919352213541667, + "learning_rate": 2e-05, + "loss": 0.1724, + "loss/crossentropy": 2.005366265773773, + "loss/hidden": 0.158203125, + "loss/logits": 0.014153223484754562, + "step": 273 + }, + { + "epoch": 0.274, + "grad_norm": 1.765625, + "grad_norm_var": 0.4723894755045573, + "learning_rate": 2e-05, + "loss": 0.1808, + "loss/crossentropy": 1.7814961075782776, + "loss/hidden": 0.166015625, + "loss/logits": 0.014784782659262419, + "step": 274 + }, + { + "epoch": 0.275, + "grad_norm": 1.9921875, + "grad_norm_var": 0.4697011311848958, + "learning_rate": 2e-05, + "loss": 0.1963, + "loss/crossentropy": 1.5670437216758728, + "loss/hidden": 0.1796875, + "loss/logits": 0.016570267733186483, + "step": 275 + }, + { + "epoch": 0.276, + "grad_norm": 1.4765625, + "grad_norm_var": 0.464800771077474, + "learning_rate": 2e-05, + "loss": 0.1604, + "loss/crossentropy": 2.009281039237976, + "loss/hidden": 0.1494140625, + "loss/logits": 0.010985464788973331, + "step": 276 + }, + { + "epoch": 0.277, + "grad_norm": 1.4453125, + "grad_norm_var": 0.45259501139322916, + "learning_rate": 2e-05, + "loss": 0.168, + "loss/crossentropy": 1.7085555791854858, + "loss/hidden": 0.15625, + "loss/logits": 0.011709913145750761, + "step": 277 + }, + { + "epoch": 0.278, + "grad_norm": 1.3828125, + "grad_norm_var": 0.46154683430989585, + "learning_rate": 2e-05, + "loss": 0.1456, + "loss/crossentropy": 2.789747476577759, + "loss/hidden": 0.1337890625, + "loss/logits": 0.011802888009697199, + "step": 278 + }, + { + "epoch": 0.279, + "grad_norm": 1.859375, + "grad_norm_var": 0.45711441040039064, + "learning_rate": 2e-05, + "loss": 0.1881, + "loss/crossentropy": 1.5918955504894257, + "loss/hidden": 0.173828125, + "loss/logits": 0.014291070867329836, + "step": 279 + }, + { + "epoch": 0.28, + "grad_norm": 2.8125, + "grad_norm_var": 0.5068682352701823, + "learning_rate": 2e-05, + "loss": 0.1458, + "loss/crossentropy": 0.8236657343804836, + "loss/hidden": 0.139404296875, + "loss/logits": 0.00643135339487344, + "step": 280 + }, + { + "epoch": 0.281, + "grad_norm": 4.125, + "grad_norm_var": 0.7956764221191406, + "learning_rate": 2e-05, + "loss": 0.1714, + "loss/crossentropy": 2.1279306411743164, + "loss/hidden": 0.15625, + "loss/logits": 0.015115040354430676, + "step": 281 + }, + { + "epoch": 0.282, + "grad_norm": 1.296875, + "grad_norm_var": 0.8177813212076823, + "learning_rate": 2e-05, + "loss": 0.1669, + "loss/crossentropy": 2.2272568941116333, + "loss/hidden": 0.1533203125, + "loss/logits": 0.0135371801443398, + "step": 282 + }, + { + "epoch": 0.283, + "grad_norm": 2.515625, + "grad_norm_var": 0.6023089090983073, + "learning_rate": 2e-05, + "loss": 0.1781, + "loss/crossentropy": 2.2013776302337646, + "loss/hidden": 0.16259765625, + "loss/logits": 0.015500886365771294, + "step": 283 + }, + { + "epoch": 0.284, + "grad_norm": 1.96875, + "grad_norm_var": 0.5695391337076823, + "learning_rate": 2e-05, + "loss": 0.1822, + "loss/crossentropy": 1.6315099596977234, + "loss/hidden": 0.1689453125, + "loss/logits": 0.013229990843683481, + "step": 284 + }, + { + "epoch": 0.285, + "grad_norm": 2.421875, + "grad_norm_var": 0.550426991780599, + "learning_rate": 2e-05, + "loss": 0.1877, + "loss/crossentropy": 1.329133152961731, + "loss/hidden": 0.1748046875, + "loss/logits": 0.012850106693804264, + "step": 285 + }, + { + "epoch": 0.286, + "grad_norm": 2.78125, + "grad_norm_var": 0.5659576416015625, + "learning_rate": 2e-05, + "loss": 0.1725, + "loss/crossentropy": 2.0431485772132874, + "loss/hidden": 0.15966796875, + "loss/logits": 0.01284833624958992, + "step": 286 + }, + { + "epoch": 0.287, + "grad_norm": 2.15625, + "grad_norm_var": 0.5648915608723958, + "learning_rate": 2e-05, + "loss": 0.2173, + "loss/crossentropy": 1.6292879581451416, + "loss/hidden": 0.19970703125, + "loss/logits": 0.017579292878508568, + "step": 287 + }, + { + "epoch": 0.288, + "grad_norm": 1.4296875, + "grad_norm_var": 0.5841379801432292, + "learning_rate": 2e-05, + "loss": 0.1632, + "loss/crossentropy": 2.0630630254745483, + "loss/hidden": 0.14990234375, + "loss/logits": 0.013251845724880695, + "step": 288 + }, + { + "epoch": 0.289, + "grad_norm": 1.8203125, + "grad_norm_var": 0.5364664713541667, + "learning_rate": 2e-05, + "loss": 0.2067, + "loss/crossentropy": 2.168562591075897, + "loss/hidden": 0.18798828125, + "loss/logits": 0.01867722487077117, + "step": 289 + }, + { + "epoch": 0.29, + "grad_norm": 1.21875, + "grad_norm_var": 0.5779449462890625, + "learning_rate": 2e-05, + "loss": 0.166, + "loss/crossentropy": 1.8953060507774353, + "loss/hidden": 0.15380859375, + "loss/logits": 0.01215141685679555, + "step": 290 + }, + { + "epoch": 0.291, + "grad_norm": 1.7109375, + "grad_norm_var": 0.5848297119140625, + "learning_rate": 2e-05, + "loss": 0.187, + "loss/crossentropy": 1.6148796081542969, + "loss/hidden": 0.173828125, + "loss/logits": 0.013202093075960875, + "step": 291 + }, + { + "epoch": 0.292, + "grad_norm": 1.6328125, + "grad_norm_var": 0.5749013264973958, + "learning_rate": 2e-05, + "loss": 0.197, + "loss/crossentropy": 1.7814635038375854, + "loss/hidden": 0.1826171875, + "loss/logits": 0.014429094269871712, + "step": 292 + }, + { + "epoch": 0.293, + "grad_norm": 2.015625, + "grad_norm_var": 0.5503028869628906, + "learning_rate": 2e-05, + "loss": 0.1814, + "loss/crossentropy": 2.1830875873565674, + "loss/hidden": 0.16748046875, + "loss/logits": 0.013968405313789845, + "step": 293 + }, + { + "epoch": 0.294, + "grad_norm": 1.7109375, + "grad_norm_var": 0.5268898010253906, + "learning_rate": 2e-05, + "loss": 0.2098, + "loss/crossentropy": 1.681401550769806, + "loss/hidden": 0.19482421875, + "loss/logits": 0.01494319923222065, + "step": 294 + }, + { + "epoch": 0.295, + "grad_norm": 1.3046875, + "grad_norm_var": 0.5633453369140625, + "learning_rate": 2e-05, + "loss": 0.1884, + "loss/crossentropy": 1.953886091709137, + "loss/hidden": 0.173828125, + "loss/logits": 0.014602533541619778, + "step": 295 + }, + { + "epoch": 0.296, + "grad_norm": 1.6875, + "grad_norm_var": 0.5292144775390625, + "learning_rate": 2e-05, + "loss": 0.1987, + "loss/crossentropy": 1.6944631338119507, + "loss/hidden": 0.18603515625, + "loss/logits": 0.012617598287761211, + "step": 296 + }, + { + "epoch": 0.297, + "grad_norm": 1.8359375, + "grad_norm_var": 0.20425999959309896, + "learning_rate": 2e-05, + "loss": 0.2261, + "loss/crossentropy": 2.214042544364929, + "loss/hidden": 0.205078125, + "loss/logits": 0.020975200459361076, + "step": 297 + }, + { + "epoch": 0.298, + "grad_norm": 1.1484375, + "grad_norm_var": 0.2164703369140625, + "learning_rate": 2e-05, + "loss": 0.1842, + "loss/crossentropy": 2.1237878799438477, + "loss/hidden": 0.16943359375, + "loss/logits": 0.014801782555878162, + "step": 298 + }, + { + "epoch": 0.299, + "grad_norm": 1.4765625, + "grad_norm_var": 0.18964818318684895, + "learning_rate": 2e-05, + "loss": 0.1814, + "loss/crossentropy": 1.492847979068756, + "loss/hidden": 0.16845703125, + "loss/logits": 0.012967187445610762, + "step": 299 + }, + { + "epoch": 0.3, + "grad_norm": 1.890625, + "grad_norm_var": 0.1879595438639323, + "learning_rate": 2e-05, + "loss": 0.1776, + "loss/crossentropy": 2.2924291491508484, + "loss/hidden": 0.16357421875, + "loss/logits": 0.014043833129107952, + "step": 300 + }, + { + "epoch": 0.301, + "grad_norm": 4.1875, + "grad_norm_var": 0.5374061584472656, + "learning_rate": 2e-05, + "loss": 0.2062, + "loss/crossentropy": 1.607342541217804, + "loss/hidden": 0.18994140625, + "loss/logits": 0.016273885034024715, + "step": 301 + }, + { + "epoch": 0.302, + "grad_norm": 1.5859375, + "grad_norm_var": 0.4823486328125, + "learning_rate": 2e-05, + "loss": 0.2143, + "loss/crossentropy": 1.8559609055519104, + "loss/hidden": 0.197265625, + "loss/logits": 0.017047187313437462, + "step": 302 + }, + { + "epoch": 0.303, + "grad_norm": 1.2265625, + "grad_norm_var": 0.4923052469889323, + "learning_rate": 2e-05, + "loss": 0.1814, + "loss/crossentropy": 2.4204115867614746, + "loss/hidden": 0.16796875, + "loss/logits": 0.013407074846327305, + "step": 303 + }, + { + "epoch": 0.304, + "grad_norm": 2.15625, + "grad_norm_var": 0.49497782389322914, + "learning_rate": 2e-05, + "loss": 0.2058, + "loss/crossentropy": 1.7306669354438782, + "loss/hidden": 0.189453125, + "loss/logits": 0.016323519870638847, + "step": 304 + }, + { + "epoch": 0.305, + "grad_norm": 1.6484375, + "grad_norm_var": 0.4960856119791667, + "learning_rate": 2e-05, + "loss": 0.1877, + "loss/crossentropy": 2.212082266807556, + "loss/hidden": 0.171875, + "loss/logits": 0.015811644960194826, + "step": 305 + }, + { + "epoch": 0.306, + "grad_norm": 1.3046875, + "grad_norm_var": 0.4901466369628906, + "learning_rate": 2e-05, + "loss": 0.1902, + "loss/crossentropy": 1.9250993132591248, + "loss/hidden": 0.17626953125, + "loss/logits": 0.013882125727832317, + "step": 306 + }, + { + "epoch": 0.307, + "grad_norm": 5.75, + "grad_norm_var": 1.4711181640625, + "learning_rate": 2e-05, + "loss": 0.1934, + "loss/crossentropy": 0.4879331737756729, + "loss/hidden": 0.18701171875, + "loss/logits": 0.006413323106244206, + "step": 307 + }, + { + "epoch": 0.308, + "grad_norm": 3.046875, + "grad_norm_var": 1.520232899983724, + "learning_rate": 2e-05, + "loss": 0.1973, + "loss/crossentropy": 1.4504847526550293, + "loss/hidden": 0.1875, + "loss/logits": 0.009785078698769212, + "step": 308 + }, + { + "epoch": 0.309, + "grad_norm": 1.40625, + "grad_norm_var": 1.5522092183430989, + "learning_rate": 2e-05, + "loss": 0.2057, + "loss/crossentropy": 2.149027943611145, + "loss/hidden": 0.189453125, + "loss/logits": 0.01620970480144024, + "step": 309 + }, + { + "epoch": 0.31, + "grad_norm": 1.7578125, + "grad_norm_var": 1.550005849202474, + "learning_rate": 2e-05, + "loss": 0.2027, + "loss/crossentropy": 2.1503273248672485, + "loss/hidden": 0.185546875, + "loss/logits": 0.01712088193744421, + "step": 310 + }, + { + "epoch": 0.311, + "grad_norm": 1.4375, + "grad_norm_var": 1.5372304280598958, + "learning_rate": 2e-05, + "loss": 0.1888, + "loss/crossentropy": 2.1748342514038086, + "loss/hidden": 0.17333984375, + "loss/logits": 0.01546872965991497, + "step": 311 + }, + { + "epoch": 0.312, + "grad_norm": 1.4921875, + "grad_norm_var": 1.5502703348795572, + "learning_rate": 2e-05, + "loss": 0.2158, + "loss/crossentropy": 1.3706732988357544, + "loss/hidden": 0.20166015625, + "loss/logits": 0.014161557890474796, + "step": 312 + }, + { + "epoch": 0.313, + "grad_norm": 2.421875, + "grad_norm_var": 1.5523111979166666, + "learning_rate": 2e-05, + "loss": 0.2021, + "loss/crossentropy": 1.8907567262649536, + "loss/hidden": 0.18701171875, + "loss/logits": 0.015071831177920103, + "step": 313 + }, + { + "epoch": 0.314, + "grad_norm": 1.296875, + "grad_norm_var": 1.5344378153483074, + "learning_rate": 2e-05, + "loss": 0.201, + "loss/crossentropy": 1.7888588905334473, + "loss/hidden": 0.1875, + "loss/logits": 0.013532605487853289, + "step": 314 + }, + { + "epoch": 0.315, + "grad_norm": 1.5859375, + "grad_norm_var": 1.5256507873535157, + "learning_rate": 2e-05, + "loss": 0.2166, + "loss/crossentropy": 1.5358025133609772, + "loss/hidden": 0.2021484375, + "loss/logits": 0.014410331379622221, + "step": 315 + }, + { + "epoch": 0.316, + "grad_norm": 56.0, + "grad_norm_var": 182.73569310506184, + "learning_rate": 2e-05, + "loss": 0.2529, + "loss/crossentropy": 2.1001065373420715, + "loss/hidden": 0.234375, + "loss/logits": 0.01847642147913575, + "step": 316 + }, + { + "epoch": 0.317, + "grad_norm": 1.296875, + "grad_norm_var": 183.77112401326497, + "learning_rate": 2e-05, + "loss": 0.1958, + "loss/crossentropy": 2.3731868267059326, + "loss/hidden": 0.1796875, + "loss/logits": 0.01615766156464815, + "step": 317 + }, + { + "epoch": 0.318, + "grad_norm": 1.53125, + "grad_norm_var": 183.79867248535157, + "learning_rate": 2e-05, + "loss": 0.2212, + "loss/crossentropy": 1.8716753125190735, + "loss/hidden": 0.2041015625, + "loss/logits": 0.017116894014179707, + "step": 318 + }, + { + "epoch": 0.319, + "grad_norm": 1.9921875, + "grad_norm_var": 183.41590983072916, + "learning_rate": 2e-05, + "loss": 0.1938, + "loss/crossentropy": 1.2205194532871246, + "loss/hidden": 0.18115234375, + "loss/logits": 0.012608660385012627, + "step": 319 + }, + { + "epoch": 0.32, + "grad_norm": 1.203125, + "grad_norm_var": 183.88273010253906, + "learning_rate": 2e-05, + "loss": 0.1822, + "loss/crossentropy": 2.3611029386520386, + "loss/hidden": 0.1689453125, + "loss/logits": 0.013240456581115723, + "step": 320 + }, + { + "epoch": 0.321, + "grad_norm": 1.3046875, + "grad_norm_var": 184.05854390462238, + "learning_rate": 2e-05, + "loss": 0.193, + "loss/crossentropy": 1.8402240872383118, + "loss/hidden": 0.18017578125, + "loss/logits": 0.012811433058232069, + "step": 321 + }, + { + "epoch": 0.322, + "grad_norm": 1.3671875, + "grad_norm_var": 184.02547912597657, + "learning_rate": 2e-05, + "loss": 0.2238, + "loss/crossentropy": 1.9131136536598206, + "loss/hidden": 0.20751953125, + "loss/logits": 0.016317113302648067, + "step": 322 + }, + { + "epoch": 0.323, + "grad_norm": 1.9765625, + "grad_norm_var": 184.69184951782228, + "learning_rate": 2e-05, + "loss": 0.2509, + "loss/crossentropy": 1.4010455012321472, + "loss/hidden": 0.23193359375, + "loss/logits": 0.018928353674709797, + "step": 323 + }, + { + "epoch": 0.324, + "grad_norm": 2.234375, + "grad_norm_var": 184.9522621154785, + "learning_rate": 2e-05, + "loss": 0.1929, + "loss/crossentropy": 1.9659223556518555, + "loss/hidden": 0.1796875, + "loss/logits": 0.013216304127126932, + "step": 324 + }, + { + "epoch": 0.325, + "grad_norm": 1.75, + "grad_norm_var": 184.79406102498373, + "learning_rate": 2e-05, + "loss": 0.1877, + "loss/crossentropy": 1.5221052765846252, + "loss/hidden": 0.17626953125, + "loss/logits": 0.011447824770584702, + "step": 325 + }, + { + "epoch": 0.326, + "grad_norm": 1.5546875, + "grad_norm_var": 184.88554662068685, + "learning_rate": 2e-05, + "loss": 0.2212, + "loss/crossentropy": 2.06081086397171, + "loss/hidden": 0.20361328125, + "loss/logits": 0.017567144706845284, + "step": 326 + }, + { + "epoch": 0.327, + "grad_norm": 3.578125, + "grad_norm_var": 184.14719823201497, + "learning_rate": 2e-05, + "loss": 0.1707, + "loss/crossentropy": 0.8908511102199554, + "loss/hidden": 0.1640625, + "loss/logits": 0.006589735276065767, + "step": 327 + }, + { + "epoch": 0.328, + "grad_norm": 2.1875, + "grad_norm_var": 183.83722737630208, + "learning_rate": 2e-05, + "loss": 0.2041, + "loss/crossentropy": 1.4793621897697449, + "loss/hidden": 0.19384765625, + "loss/logits": 0.010210367618128657, + "step": 328 + }, + { + "epoch": 0.329, + "grad_norm": 1.5703125, + "grad_norm_var": 184.19855931599935, + "learning_rate": 2e-05, + "loss": 0.2174, + "loss/crossentropy": 1.5629376769065857, + "loss/hidden": 0.20166015625, + "loss/logits": 0.015733799897134304, + "step": 329 + }, + { + "epoch": 0.33, + "grad_norm": 1.4609375, + "grad_norm_var": 184.11591389973958, + "learning_rate": 2e-05, + "loss": 0.2297, + "loss/crossentropy": 2.016783118247986, + "loss/hidden": 0.2119140625, + "loss/logits": 0.017778108827769756, + "step": 330 + }, + { + "epoch": 0.331, + "grad_norm": 1.3671875, + "grad_norm_var": 184.22320963541668, + "learning_rate": 2e-05, + "loss": 0.2183, + "loss/crossentropy": 2.3946865797042847, + "loss/hidden": 0.2001953125, + "loss/logits": 0.01807898748666048, + "step": 331 + }, + { + "epoch": 0.332, + "grad_norm": 1.234375, + "grad_norm_var": 0.35546773274739585, + "learning_rate": 2e-05, + "loss": 0.2244, + "loss/crossentropy": 1.6463975310325623, + "loss/hidden": 0.2099609375, + "loss/logits": 0.014466887805610895, + "step": 332 + }, + { + "epoch": 0.333, + "grad_norm": 1.703125, + "grad_norm_var": 0.34256083170572915, + "learning_rate": 2e-05, + "loss": 0.2653, + "loss/crossentropy": 1.727737307548523, + "loss/hidden": 0.24462890625, + "loss/logits": 0.020694734528660774, + "step": 333 + }, + { + "epoch": 0.334, + "grad_norm": 2.34375, + "grad_norm_var": 0.36001688639322915, + "learning_rate": 2e-05, + "loss": 0.2636, + "loss/crossentropy": 1.8381291031837463, + "loss/hidden": 0.244140625, + "loss/logits": 0.019478057511150837, + "step": 334 + }, + { + "epoch": 0.335, + "grad_norm": 5.5, + "grad_norm_var": 1.2181292215983073, + "learning_rate": 2e-05, + "loss": 0.2789, + "loss/crossentropy": 1.395434319972992, + "loss/hidden": 0.25732421875, + "loss/logits": 0.02152822446078062, + "step": 335 + }, + { + "epoch": 0.336, + "grad_norm": 1.7578125, + "grad_norm_var": 1.1768707275390624, + "learning_rate": 2e-05, + "loss": 0.2301, + "loss/crossentropy": 1.7802979946136475, + "loss/hidden": 0.212890625, + "loss/logits": 0.01717265695333481, + "step": 336 + }, + { + "epoch": 0.337, + "grad_norm": 1.2265625, + "grad_norm_var": 1.1850748697916667, + "learning_rate": 2e-05, + "loss": 0.2195, + "loss/crossentropy": 1.864999234676361, + "loss/hidden": 0.20361328125, + "loss/logits": 0.015909720212221146, + "step": 337 + }, + { + "epoch": 0.338, + "grad_norm": 1.5078125, + "grad_norm_var": 1.1734934488932292, + "learning_rate": 2e-05, + "loss": 0.2322, + "loss/crossentropy": 1.9171935319900513, + "loss/hidden": 0.2138671875, + "loss/logits": 0.01834118738770485, + "step": 338 + }, + { + "epoch": 0.339, + "grad_norm": 1.7109375, + "grad_norm_var": 1.1808430989583334, + "learning_rate": 2e-05, + "loss": 0.2546, + "loss/crossentropy": 2.232408821582794, + "loss/hidden": 0.23388671875, + "loss/logits": 0.02068551816046238, + "step": 339 + }, + { + "epoch": 0.34, + "grad_norm": 4.40625, + "grad_norm_var": 1.531086222330729, + "learning_rate": 2e-05, + "loss": 0.2209, + "loss/crossentropy": 0.885938722640276, + "loss/hidden": 0.2138671875, + "loss/logits": 0.0069831793662160635, + "step": 340 + }, + { + "epoch": 0.341, + "grad_norm": 1.8046875, + "grad_norm_var": 1.5281471252441405, + "learning_rate": 2e-05, + "loss": 0.274, + "loss/crossentropy": 2.053671360015869, + "loss/hidden": 0.25, + "loss/logits": 0.024039674550294876, + "step": 341 + }, + { + "epoch": 0.342, + "grad_norm": 1.4765625, + "grad_norm_var": 1.535064442952474, + "learning_rate": 2e-05, + "loss": 0.248, + "loss/crossentropy": 2.1628893613815308, + "loss/hidden": 0.22900390625, + "loss/logits": 0.01902489084750414, + "step": 342 + }, + { + "epoch": 0.343, + "grad_norm": 1.6875, + "grad_norm_var": 1.4053301493326822, + "learning_rate": 2e-05, + "loss": 0.2355, + "loss/crossentropy": 1.9784727692604065, + "loss/hidden": 0.216796875, + "loss/logits": 0.018667724914848804, + "step": 343 + }, + { + "epoch": 0.344, + "grad_norm": 1.9453125, + "grad_norm_var": 1.4048492431640625, + "learning_rate": 2e-05, + "loss": 0.2215, + "loss/crossentropy": 2.1430813670158386, + "loss/hidden": 0.205078125, + "loss/logits": 0.016372697427868843, + "step": 344 + }, + { + "epoch": 0.345, + "grad_norm": 3.34375, + "grad_norm_var": 1.489422353108724, + "learning_rate": 2e-05, + "loss": 0.2828, + "loss/crossentropy": 1.4574592113494873, + "loss/hidden": 0.259765625, + "loss/logits": 0.02300189435482025, + "step": 345 + }, + { + "epoch": 0.346, + "grad_norm": 4.59375, + "grad_norm_var": 1.8130035400390625, + "learning_rate": 2e-05, + "loss": 0.2555, + "loss/crossentropy": 2.1325125694274902, + "loss/hidden": 0.234375, + "loss/logits": 0.021130304783582687, + "step": 346 + }, + { + "epoch": 0.347, + "grad_norm": 1.4453125, + "grad_norm_var": 1.8031412760416667, + "learning_rate": 2e-05, + "loss": 0.233, + "loss/crossentropy": 2.6941460371017456, + "loss/hidden": 0.21435546875, + "loss/logits": 0.01859632506966591, + "step": 347 + }, + { + "epoch": 0.348, + "grad_norm": 1.6171875, + "grad_norm_var": 1.755077870686849, + "learning_rate": 2e-05, + "loss": 0.2562, + "loss/crossentropy": 1.8957814574241638, + "loss/hidden": 0.236328125, + "loss/logits": 0.019866405054926872, + "step": 348 + }, + { + "epoch": 0.349, + "grad_norm": 1.953125, + "grad_norm_var": 1.7364418029785156, + "learning_rate": 2e-05, + "loss": 0.2507, + "loss/crossentropy": 2.5658878087997437, + "loss/hidden": 0.2294921875, + "loss/logits": 0.02118699811398983, + "step": 349 + }, + { + "epoch": 0.35, + "grad_norm": 1.890625, + "grad_norm_var": 1.7523719787597656, + "learning_rate": 2e-05, + "loss": 0.233, + "loss/crossentropy": 1.9111933708190918, + "loss/hidden": 0.21533203125, + "loss/logits": 0.01770856324583292, + "step": 350 + }, + { + "epoch": 0.351, + "grad_norm": 2.625, + "grad_norm_var": 1.0678749084472656, + "learning_rate": 2e-05, + "loss": 0.2712, + "loss/crossentropy": 1.5525288581848145, + "loss/hidden": 0.25244140625, + "loss/logits": 0.01877846010029316, + "step": 351 + }, + { + "epoch": 0.352, + "grad_norm": 1.671875, + "grad_norm_var": 1.07325439453125, + "learning_rate": 2e-05, + "loss": 0.2398, + "loss/crossentropy": 1.47780179977417, + "loss/hidden": 0.224609375, + "loss/logits": 0.015163760632276535, + "step": 352 + }, + { + "epoch": 0.353, + "grad_norm": 1.40625, + "grad_norm_var": 1.0523902893066406, + "learning_rate": 2e-05, + "loss": 0.2579, + "loss/crossentropy": 1.6976242065429688, + "loss/hidden": 0.240234375, + "loss/logits": 0.01768268644809723, + "step": 353 + }, + { + "epoch": 0.354, + "grad_norm": 1.375, + "grad_norm_var": 1.065623982747396, + "learning_rate": 2e-05, + "loss": 0.2594, + "loss/crossentropy": 1.5402989983558655, + "loss/hidden": 0.24169921875, + "loss/logits": 0.017742513678967953, + "step": 354 + }, + { + "epoch": 0.355, + "grad_norm": 2.609375, + "grad_norm_var": 1.0593360900878905, + "learning_rate": 2e-05, + "loss": 0.2983, + "loss/crossentropy": 1.7891557812690735, + "loss/hidden": 0.2744140625, + "loss/logits": 0.023881751112639904, + "step": 355 + }, + { + "epoch": 0.356, + "grad_norm": 1.5859375, + "grad_norm_var": 0.7421427408854167, + "learning_rate": 2e-05, + "loss": 0.2353, + "loss/crossentropy": 2.255465269088745, + "loss/hidden": 0.2177734375, + "loss/logits": 0.01755282748490572, + "step": 356 + }, + { + "epoch": 0.357, + "grad_norm": 1.4375, + "grad_norm_var": 0.763287099202474, + "learning_rate": 2e-05, + "loss": 0.2388, + "loss/crossentropy": 2.2716734409332275, + "loss/hidden": 0.22021484375, + "loss/logits": 0.018602201715111732, + "step": 357 + }, + { + "epoch": 0.358, + "grad_norm": 2.34375, + "grad_norm_var": 0.7449666341145833, + "learning_rate": 2e-05, + "loss": 0.2737, + "loss/crossentropy": 1.8382077813148499, + "loss/hidden": 0.2548828125, + "loss/logits": 0.018825003411620855, + "step": 358 + }, + { + "epoch": 0.359, + "grad_norm": 1.5546875, + "grad_norm_var": 0.7532976786295573, + "learning_rate": 2e-05, + "loss": 0.2391, + "loss/crossentropy": 1.6230210661888123, + "loss/hidden": 0.224609375, + "loss/logits": 0.014487342443317175, + "step": 359 + }, + { + "epoch": 0.36, + "grad_norm": 1.421875, + "grad_norm_var": 0.7803385416666667, + "learning_rate": 2e-05, + "loss": 0.2519, + "loss/crossentropy": 1.6961406469345093, + "loss/hidden": 0.234375, + "loss/logits": 0.017499960027635098, + "step": 360 + }, + { + "epoch": 0.361, + "grad_norm": 1.5703125, + "grad_norm_var": 0.6720965067545573, + "learning_rate": 2e-05, + "loss": 0.2623, + "loss/crossentropy": 2.1821005940437317, + "loss/hidden": 0.24072265625, + "loss/logits": 0.021556712687015533, + "step": 361 + }, + { + "epoch": 0.362, + "grad_norm": 1.6484375, + "grad_norm_var": 0.17363688151041667, + "learning_rate": 2e-05, + "loss": 0.2759, + "loss/crossentropy": 1.7173206806182861, + "loss/hidden": 0.255859375, + "loss/logits": 0.020033356733620167, + "step": 362 + }, + { + "epoch": 0.363, + "grad_norm": 1.5859375, + "grad_norm_var": 0.16897684733072918, + "learning_rate": 2e-05, + "loss": 0.2552, + "loss/crossentropy": 1.8281689882278442, + "loss/hidden": 0.23681640625, + "loss/logits": 0.018404729664325714, + "step": 363 + }, + { + "epoch": 0.364, + "grad_norm": 1.3125, + "grad_norm_var": 0.1809282938639323, + "learning_rate": 2e-05, + "loss": 0.2546, + "loss/crossentropy": 2.181256651878357, + "loss/hidden": 0.23486328125, + "loss/logits": 0.01975287776440382, + "step": 364 + }, + { + "epoch": 0.365, + "grad_norm": 3.796875, + "grad_norm_var": 0.4434466044108073, + "learning_rate": 2e-05, + "loss": 0.2803, + "loss/crossentropy": 1.4486916065216064, + "loss/hidden": 0.2607421875, + "loss/logits": 0.01950985286384821, + "step": 365 + }, + { + "epoch": 0.366, + "grad_norm": 1.234375, + "grad_norm_var": 0.4680987040201823, + "learning_rate": 2e-05, + "loss": 0.2504, + "loss/crossentropy": 2.026048183441162, + "loss/hidden": 0.232421875, + "loss/logits": 0.017978372983634472, + "step": 366 + }, + { + "epoch": 0.367, + "grad_norm": 4.3125, + "grad_norm_var": 0.8263628641764323, + "learning_rate": 2e-05, + "loss": 0.2579, + "loss/crossentropy": 1.4382375180721283, + "loss/hidden": 0.2412109375, + "loss/logits": 0.016655512619763613, + "step": 367 + }, + { + "epoch": 0.368, + "grad_norm": 2.25, + "grad_norm_var": 0.827416737874349, + "learning_rate": 2e-05, + "loss": 0.3072, + "loss/crossentropy": 1.57509446144104, + "loss/hidden": 0.2880859375, + "loss/logits": 0.019162926822900772, + "step": 368 + }, + { + "epoch": 0.369, + "grad_norm": 1.78125, + "grad_norm_var": 0.808251698811849, + "learning_rate": 2e-05, + "loss": 0.2383, + "loss/crossentropy": 2.0060970187187195, + "loss/hidden": 0.22021484375, + "loss/logits": 0.018060280941426754, + "step": 369 + }, + { + "epoch": 0.37, + "grad_norm": 2.546875, + "grad_norm_var": 0.798180898030599, + "learning_rate": 2e-05, + "loss": 0.2523, + "loss/crossentropy": 1.2137621641159058, + "loss/hidden": 0.24072265625, + "loss/logits": 0.011561613995581865, + "step": 370 + }, + { + "epoch": 0.371, + "grad_norm": 1.703125, + "grad_norm_var": 0.7833717346191407, + "learning_rate": 2e-05, + "loss": 0.2561, + "loss/crossentropy": 1.764179289340973, + "loss/hidden": 0.240234375, + "loss/logits": 0.015869705006480217, + "step": 371 + }, + { + "epoch": 0.372, + "grad_norm": 1.5625, + "grad_norm_var": 0.784716796875, + "learning_rate": 2e-05, + "loss": 0.2642, + "loss/crossentropy": 2.1394487619400024, + "loss/hidden": 0.2451171875, + "loss/logits": 0.01907090563327074, + "step": 372 + }, + { + "epoch": 0.373, + "grad_norm": 1.9765625, + "grad_norm_var": 0.7621681213378906, + "learning_rate": 2e-05, + "loss": 0.2496, + "loss/crossentropy": 2.151320219039917, + "loss/hidden": 0.23095703125, + "loss/logits": 0.018605505116283894, + "step": 373 + }, + { + "epoch": 0.374, + "grad_norm": 1.5234375, + "grad_norm_var": 0.77073974609375, + "learning_rate": 2e-05, + "loss": 0.2426, + "loss/crossentropy": 2.291616916656494, + "loss/hidden": 0.2255859375, + "loss/logits": 0.01696862932294607, + "step": 374 + }, + { + "epoch": 0.375, + "grad_norm": 1.1640625, + "grad_norm_var": 0.8027577718098958, + "learning_rate": 2e-05, + "loss": 0.2482, + "loss/crossentropy": 2.1597548127174377, + "loss/hidden": 0.228515625, + "loss/logits": 0.019656311720609665, + "step": 375 + }, + { + "epoch": 0.376, + "grad_norm": 4.5625, + "grad_norm_var": 1.1930867513020833, + "learning_rate": 2e-05, + "loss": 0.2546, + "loss/crossentropy": 0.7966546472162008, + "loss/hidden": 0.24609375, + "loss/logits": 0.008532016014214605, + "step": 376 + }, + { + "epoch": 0.377, + "grad_norm": 1.25, + "grad_norm_var": 1.2246070861816407, + "learning_rate": 2e-05, + "loss": 0.2394, + "loss/crossentropy": 1.730500340461731, + "loss/hidden": 0.22314453125, + "loss/logits": 0.016217158176004887, + "step": 377 + }, + { + "epoch": 0.378, + "grad_norm": 1.9453125, + "grad_norm_var": 1.210729726155599, + "learning_rate": 2e-05, + "loss": 0.2672, + "loss/crossentropy": 2.0575554966926575, + "loss/hidden": 0.2470703125, + "loss/logits": 0.02009457629173994, + "step": 378 + }, + { + "epoch": 0.379, + "grad_norm": 4.15625, + "grad_norm_var": 1.4280181884765626, + "learning_rate": 2e-05, + "loss": 0.3649, + "loss/crossentropy": 2.409613251686096, + "loss/hidden": 0.330078125, + "loss/logits": 0.034814249724149704, + "step": 379 + }, + { + "epoch": 0.38, + "grad_norm": 2.34375, + "grad_norm_var": 1.3563140869140624, + "learning_rate": 2e-05, + "loss": 0.2651, + "loss/crossentropy": 1.4721761345863342, + "loss/hidden": 0.2490234375, + "loss/logits": 0.016095119062811136, + "step": 380 + }, + { + "epoch": 0.381, + "grad_norm": 1.2265625, + "grad_norm_var": 1.2842750549316406, + "learning_rate": 2e-05, + "loss": 0.2538, + "loss/crossentropy": 2.51900315284729, + "loss/hidden": 0.2314453125, + "loss/logits": 0.022326381877064705, + "step": 381 + }, + { + "epoch": 0.382, + "grad_norm": 2.234375, + "grad_norm_var": 1.2151995340983073, + "learning_rate": 2e-05, + "loss": 0.2743, + "loss/crossentropy": 2.030519187450409, + "loss/hidden": 0.2548828125, + "loss/logits": 0.01944338995963335, + "step": 382 + }, + { + "epoch": 0.383, + "grad_norm": 3.859375, + "grad_norm_var": 1.1054583231608073, + "learning_rate": 2e-05, + "loss": 0.3105, + "loss/crossentropy": 0.7516276463866234, + "loss/hidden": 0.2978515625, + "loss/logits": 0.012636175146326423, + "step": 383 + }, + { + "epoch": 0.384, + "grad_norm": 2.296875, + "grad_norm_var": 1.1055620829264323, + "learning_rate": 2e-05, + "loss": 0.2867, + "loss/crossentropy": 1.9317356944084167, + "loss/hidden": 0.263671875, + "loss/logits": 0.023075740784406662, + "step": 384 + }, + { + "epoch": 0.385, + "grad_norm": 2.09375, + "grad_norm_var": 1.0917884826660156, + "learning_rate": 2e-05, + "loss": 0.3261, + "loss/crossentropy": 2.1155296564102173, + "loss/hidden": 0.2998046875, + "loss/logits": 0.02629261091351509, + "step": 385 + }, + { + "epoch": 0.386, + "grad_norm": 1.7734375, + "grad_norm_var": 1.1014312744140624, + "learning_rate": 2e-05, + "loss": 0.287, + "loss/crossentropy": 2.1998232007026672, + "loss/hidden": 0.265625, + "loss/logits": 0.021336179226636887, + "step": 386 + }, + { + "epoch": 0.387, + "grad_norm": 1.8671875, + "grad_norm_var": 1.0915992736816407, + "learning_rate": 2e-05, + "loss": 0.2608, + "loss/crossentropy": 1.9437836408615112, + "loss/hidden": 0.2412109375, + "loss/logits": 0.019607914611697197, + "step": 387 + }, + { + "epoch": 0.388, + "grad_norm": 2.125, + "grad_norm_var": 1.0605812072753906, + "learning_rate": 2e-05, + "loss": 0.2871, + "loss/crossentropy": 1.7142232656478882, + "loss/hidden": 0.2666015625, + "loss/logits": 0.020461218431591988, + "step": 388 + }, + { + "epoch": 0.389, + "grad_norm": 1.640625, + "grad_norm_var": 1.0809977213541666, + "learning_rate": 2e-05, + "loss": 0.2863, + "loss/crossentropy": 2.236941933631897, + "loss/hidden": 0.2626953125, + "loss/logits": 0.023648610338568687, + "step": 389 + }, + { + "epoch": 0.39, + "grad_norm": 3.125, + "grad_norm_var": 1.0853248596191407, + "learning_rate": 2e-05, + "loss": 0.2733, + "loss/crossentropy": 1.2834028005599976, + "loss/hidden": 0.2607421875, + "loss/logits": 0.01257804874330759, + "step": 390 + }, + { + "epoch": 0.391, + "grad_norm": 1.5, + "grad_norm_var": 1.0390787760416667, + "learning_rate": 2e-05, + "loss": 0.3026, + "loss/crossentropy": 1.5867803692817688, + "loss/hidden": 0.2822265625, + "loss/logits": 0.020396556705236435, + "step": 391 + }, + { + "epoch": 0.392, + "grad_norm": 1.53125, + "grad_norm_var": 0.7292439778645833, + "learning_rate": 2e-05, + "loss": 0.297, + "loss/crossentropy": 1.4337636232376099, + "loss/hidden": 0.2783203125, + "loss/logits": 0.01866168435662985, + "step": 392 + }, + { + "epoch": 0.393, + "grad_norm": 1.71875, + "grad_norm_var": 0.6845052083333333, + "learning_rate": 2e-05, + "loss": 0.2642, + "loss/crossentropy": 2.1386572122573853, + "loss/hidden": 0.24462890625, + "loss/logits": 0.019583708606660366, + "step": 393 + }, + { + "epoch": 0.394, + "grad_norm": 2.9375, + "grad_norm_var": 0.710375722249349, + "learning_rate": 2e-05, + "loss": 0.3313, + "loss/crossentropy": 1.936402440071106, + "loss/hidden": 0.3046875, + "loss/logits": 0.026638174429535866, + "step": 394 + }, + { + "epoch": 0.395, + "grad_norm": 1.8671875, + "grad_norm_var": 0.4642567952473958, + "learning_rate": 2e-05, + "loss": 0.2699, + "loss/crossentropy": 2.2741682529449463, + "loss/hidden": 0.248046875, + "loss/logits": 0.021812792867422104, + "step": 395 + }, + { + "epoch": 0.396, + "grad_norm": 4.84375, + "grad_norm_var": 0.9248687744140625, + "learning_rate": 2e-05, + "loss": 0.3035, + "loss/crossentropy": 1.1322659850120544, + "loss/hidden": 0.291015625, + "loss/logits": 0.01252604997716844, + "step": 396 + }, + { + "epoch": 0.397, + "grad_norm": 2.53125, + "grad_norm_var": 0.8462562561035156, + "learning_rate": 2e-05, + "loss": 0.3108, + "loss/crossentropy": 1.358659565448761, + "loss/hidden": 0.2900390625, + "loss/logits": 0.02074052207171917, + "step": 397 + }, + { + "epoch": 0.398, + "grad_norm": 1.84375, + "grad_norm_var": 0.862939198811849, + "learning_rate": 2e-05, + "loss": 0.3, + "loss/crossentropy": 1.9806614518165588, + "loss/hidden": 0.2783203125, + "loss/logits": 0.02170161809772253, + "step": 398 + }, + { + "epoch": 0.399, + "grad_norm": 1.9296875, + "grad_norm_var": 0.706591796875, + "learning_rate": 2e-05, + "loss": 0.2984, + "loss/crossentropy": 2.3857691287994385, + "loss/hidden": 0.2744140625, + "loss/logits": 0.023968273773789406, + "step": 399 + }, + { + "epoch": 0.4, + "grad_norm": 1.9140625, + "grad_norm_var": 0.7121620178222656, + "learning_rate": 2e-05, + "loss": 0.2732, + "loss/crossentropy": 2.006265163421631, + "loss/hidden": 0.2509765625, + "loss/logits": 0.02220850996673107, + "step": 400 + }, + { + "epoch": 0.401, + "grad_norm": 1.8046875, + "grad_norm_var": 0.7215810139973958, + "learning_rate": 2e-05, + "loss": 0.2935, + "loss/crossentropy": 1.7221473455429077, + "loss/hidden": 0.275390625, + "loss/logits": 0.018067960627377033, + "step": 401 + }, + { + "epoch": 0.402, + "grad_norm": 2.421875, + "grad_norm_var": 0.7123146057128906, + "learning_rate": 2e-05, + "loss": 0.2923, + "loss/crossentropy": 2.0756383538246155, + "loss/hidden": 0.275390625, + "loss/logits": 0.016928995959460735, + "step": 402 + }, + { + "epoch": 0.403, + "grad_norm": 1.53125, + "grad_norm_var": 0.7353993733723958, + "learning_rate": 2e-05, + "loss": 0.2972, + "loss/crossentropy": 1.6683465242385864, + "loss/hidden": 0.2783203125, + "loss/logits": 0.018839839845895767, + "step": 403 + }, + { + "epoch": 0.404, + "grad_norm": 1.8125, + "grad_norm_var": 0.7447987874348958, + "learning_rate": 2e-05, + "loss": 0.2966, + "loss/crossentropy": 1.737410545349121, + "loss/hidden": 0.2763671875, + "loss/logits": 0.02023144531995058, + "step": 404 + }, + { + "epoch": 0.405, + "grad_norm": 1.3046875, + "grad_norm_var": 0.7762163798014323, + "learning_rate": 2e-05, + "loss": 0.2855, + "loss/crossentropy": 2.2183534502983093, + "loss/hidden": 0.26513671875, + "loss/logits": 0.02036190778017044, + "step": 405 + }, + { + "epoch": 0.406, + "grad_norm": 1.5, + "grad_norm_var": 0.7329465230305989, + "learning_rate": 2e-05, + "loss": 0.3193, + "loss/crossentropy": 1.8786720633506775, + "loss/hidden": 0.294921875, + "loss/logits": 0.024385149590671062, + "step": 406 + }, + { + "epoch": 0.407, + "grad_norm": 1.5, + "grad_norm_var": 0.7329465230305989, + "learning_rate": 2e-05, + "loss": 0.3099, + "loss/crossentropy": 1.8731706738471985, + "loss/hidden": 0.2861328125, + "loss/logits": 0.023721362464129925, + "step": 407 + }, + { + "epoch": 0.408, + "grad_norm": 1.953125, + "grad_norm_var": 0.714214833577474, + "learning_rate": 2e-05, + "loss": 0.2993, + "loss/crossentropy": 2.0363497734069824, + "loss/hidden": 0.2763671875, + "loss/logits": 0.02292494662106037, + "step": 408 + }, + { + "epoch": 0.409, + "grad_norm": 1.421875, + "grad_norm_var": 0.7343544006347656, + "learning_rate": 2e-05, + "loss": 0.2919, + "loss/crossentropy": 1.7596482038497925, + "loss/hidden": 0.2705078125, + "loss/logits": 0.021396052092313766, + "step": 409 + }, + { + "epoch": 0.41, + "grad_norm": 2.84375, + "grad_norm_var": 0.7240577697753906, + "learning_rate": 2e-05, + "loss": 0.3154, + "loss/crossentropy": 1.080414205789566, + "loss/hidden": 0.29736328125, + "loss/logits": 0.018078335095196962, + "step": 410 + }, + { + "epoch": 0.411, + "grad_norm": 1.5625, + "grad_norm_var": 0.73785400390625, + "learning_rate": 2e-05, + "loss": 0.2928, + "loss/crossentropy": 2.527972936630249, + "loss/hidden": 0.26953125, + "loss/logits": 0.02323300577700138, + "step": 411 + }, + { + "epoch": 0.412, + "grad_norm": 1.5078125, + "grad_norm_var": 0.18848851521809895, + "learning_rate": 2e-05, + "loss": 0.2989, + "loss/crossentropy": 1.5808929204940796, + "loss/hidden": 0.28125, + "loss/logits": 0.01763766910880804, + "step": 412 + }, + { + "epoch": 0.413, + "grad_norm": 1.6328125, + "grad_norm_var": 0.1557037353515625, + "learning_rate": 2e-05, + "loss": 0.3052, + "loss/crossentropy": 2.073564648628235, + "loss/hidden": 0.2841796875, + "loss/logits": 0.021017897874116898, + "step": 413 + }, + { + "epoch": 0.414, + "grad_norm": 1.703125, + "grad_norm_var": 0.15574951171875, + "learning_rate": 2e-05, + "loss": 0.3341, + "loss/crossentropy": 1.5968445539474487, + "loss/hidden": 0.310546875, + "loss/logits": 0.023572119884192944, + "step": 414 + }, + { + "epoch": 0.415, + "grad_norm": 1.65625, + "grad_norm_var": 0.15465469360351564, + "learning_rate": 2e-05, + "loss": 0.3319, + "loss/crossentropy": 2.13019335269928, + "loss/hidden": 0.3037109375, + "loss/logits": 0.028160166926681995, + "step": 415 + }, + { + "epoch": 0.416, + "grad_norm": 1.8828125, + "grad_norm_var": 0.15405044555664063, + "learning_rate": 2e-05, + "loss": 0.2928, + "loss/crossentropy": 1.3558663129806519, + "loss/hidden": 0.2744140625, + "loss/logits": 0.018423012923449278, + "step": 416 + }, + { + "epoch": 0.417, + "grad_norm": 2.15625, + "grad_norm_var": 0.1642242431640625, + "learning_rate": 2e-05, + "loss": 0.3349, + "loss/crossentropy": 1.556907832622528, + "loss/hidden": 0.310546875, + "loss/logits": 0.0243788855150342, + "step": 417 + }, + { + "epoch": 0.418, + "grad_norm": 1.765625, + "grad_norm_var": 0.1344879150390625, + "learning_rate": 2e-05, + "loss": 0.293, + "loss/crossentropy": 2.18166720867157, + "loss/hidden": 0.2705078125, + "loss/logits": 0.022501694969832897, + "step": 418 + }, + { + "epoch": 0.419, + "grad_norm": 5.0, + "grad_norm_var": 0.7930084228515625, + "learning_rate": 2e-05, + "loss": 0.306, + "loss/crossentropy": 1.875123679637909, + "loss/hidden": 0.2841796875, + "loss/logits": 0.021816120482981205, + "step": 419 + }, + { + "epoch": 0.42, + "grad_norm": 2.0, + "grad_norm_var": 0.7917633056640625, + "learning_rate": 2e-05, + "loss": 0.3207, + "loss/crossentropy": 2.1878353357315063, + "loss/hidden": 0.29296875, + "loss/logits": 0.027718784287571907, + "step": 420 + }, + { + "epoch": 0.421, + "grad_norm": 2.5, + "grad_norm_var": 0.7763160705566406, + "learning_rate": 2e-05, + "loss": 0.3106, + "loss/crossentropy": 2.46438992023468, + "loss/hidden": 0.2841796875, + "loss/logits": 0.026430321857333183, + "step": 421 + }, + { + "epoch": 0.422, + "grad_norm": 1.59375, + "grad_norm_var": 0.7701576232910157, + "learning_rate": 2e-05, + "loss": 0.2847, + "loss/crossentropy": 1.991809368133545, + "loss/hidden": 0.265625, + "loss/logits": 0.019083392806351185, + "step": 422 + }, + { + "epoch": 0.423, + "grad_norm": 2.421875, + "grad_norm_var": 0.7565935770670573, + "learning_rate": 2e-05, + "loss": 0.415, + "loss/crossentropy": 1.6859049797058105, + "loss/hidden": 0.3818359375, + "loss/logits": 0.03313039615750313, + "step": 423 + }, + { + "epoch": 0.424, + "grad_norm": 1.859375, + "grad_norm_var": 0.7589800516764323, + "learning_rate": 2e-05, + "loss": 0.3098, + "loss/crossentropy": 1.8961586952209473, + "loss/hidden": 0.2900390625, + "loss/logits": 0.019725864753127098, + "step": 424 + }, + { + "epoch": 0.425, + "grad_norm": 1.6171875, + "grad_norm_var": 0.7438547770182292, + "learning_rate": 2e-05, + "loss": 0.3427, + "loss/crossentropy": 2.085192084312439, + "loss/hidden": 0.31640625, + "loss/logits": 0.026326753199100494, + "step": 425 + }, + { + "epoch": 0.426, + "grad_norm": 2.078125, + "grad_norm_var": 0.705224609375, + "learning_rate": 2e-05, + "loss": 0.3321, + "loss/crossentropy": 1.912731111049652, + "loss/hidden": 0.3076171875, + "loss/logits": 0.02450721152126789, + "step": 426 + }, + { + "epoch": 0.427, + "grad_norm": 1.8359375, + "grad_norm_var": 0.6918108622233073, + "learning_rate": 2e-05, + "loss": 0.3396, + "loss/crossentropy": 2.1176230907440186, + "loss/hidden": 0.310546875, + "loss/logits": 0.029072879813611507, + "step": 427 + }, + { + "epoch": 0.428, + "grad_norm": 1.6015625, + "grad_norm_var": 0.6852617899576823, + "learning_rate": 2e-05, + "loss": 0.318, + "loss/crossentropy": 2.351975202560425, + "loss/hidden": 0.291015625, + "loss/logits": 0.026953624561429024, + "step": 428 + }, + { + "epoch": 0.429, + "grad_norm": 2.3125, + "grad_norm_var": 0.6734690348307292, + "learning_rate": 2e-05, + "loss": 0.4069, + "loss/crossentropy": 1.6036078929901123, + "loss/hidden": 0.37109375, + "loss/logits": 0.03581710997968912, + "step": 429 + }, + { + "epoch": 0.43, + "grad_norm": 2.46875, + "grad_norm_var": 0.667138671875, + "learning_rate": 2e-05, + "loss": 0.3472, + "loss/crossentropy": 1.881849765777588, + "loss/hidden": 0.3232421875, + "loss/logits": 0.023961665108799934, + "step": 430 + }, + { + "epoch": 0.431, + "grad_norm": 3.625, + "grad_norm_var": 0.77403564453125, + "learning_rate": 2e-05, + "loss": 0.3121, + "loss/crossentropy": 2.3671000599861145, + "loss/hidden": 0.2900390625, + "loss/logits": 0.022101588547229767, + "step": 431 + }, + { + "epoch": 0.432, + "grad_norm": 2.4375, + "grad_norm_var": 0.7627866109212239, + "learning_rate": 2e-05, + "loss": 0.3151, + "loss/crossentropy": 1.1575224101543427, + "loss/hidden": 0.298828125, + "loss/logits": 0.016257786191999912, + "step": 432 + }, + { + "epoch": 0.433, + "grad_norm": 5.40625, + "grad_norm_var": 1.3478289286295573, + "learning_rate": 2e-05, + "loss": 0.3283, + "loss/crossentropy": 1.3821857124567032, + "loss/hidden": 0.3115234375, + "loss/logits": 0.016785149462521076, + "step": 433 + }, + { + "epoch": 0.434, + "grad_norm": 2.140625, + "grad_norm_var": 1.3182634989420572, + "learning_rate": 2e-05, + "loss": 0.3499, + "loss/crossentropy": 1.4704007506370544, + "loss/hidden": 0.326171875, + "loss/logits": 0.02373607736080885, + "step": 434 + }, + { + "epoch": 0.435, + "grad_norm": 1.7890625, + "grad_norm_var": 0.9163736979166667, + "learning_rate": 2e-05, + "loss": 0.3453, + "loss/crossentropy": 1.7521992325782776, + "loss/hidden": 0.322265625, + "loss/logits": 0.023045840673148632, + "step": 435 + }, + { + "epoch": 0.436, + "grad_norm": 2.203125, + "grad_norm_var": 0.9093251546223958, + "learning_rate": 2e-05, + "loss": 0.3079, + "loss/crossentropy": 1.4147529304027557, + "loss/hidden": 0.2919921875, + "loss/logits": 0.01587154157459736, + "step": 436 + }, + { + "epoch": 0.437, + "grad_norm": 1.78125, + "grad_norm_var": 0.9289784749348958, + "learning_rate": 2e-05, + "loss": 0.3572, + "loss/crossentropy": 2.1589527130126953, + "loss/hidden": 0.330078125, + "loss/logits": 0.027110325172543526, + "step": 437 + }, + { + "epoch": 0.438, + "grad_norm": 1.546875, + "grad_norm_var": 0.9336751302083334, + "learning_rate": 2e-05, + "loss": 0.3112, + "loss/crossentropy": 2.0695826411247253, + "loss/hidden": 0.2890625, + "loss/logits": 0.022175450809299946, + "step": 438 + }, + { + "epoch": 0.439, + "grad_norm": 8.6875, + "grad_norm_var": 3.472150675455729, + "learning_rate": 2e-05, + "loss": 0.3174, + "loss/crossentropy": 2.715834140777588, + "loss/hidden": 0.2919921875, + "loss/logits": 0.02542768605053425, + "step": 439 + }, + { + "epoch": 0.44, + "grad_norm": 2.0625, + "grad_norm_var": 3.4516398111979165, + "learning_rate": 2e-05, + "loss": 0.3531, + "loss/crossentropy": 2.089130699634552, + "loss/hidden": 0.326171875, + "loss/logits": 0.026951050385832787, + "step": 440 + }, + { + "epoch": 0.441, + "grad_norm": 5.6875, + "grad_norm_var": 3.8860979715983075, + "learning_rate": 2e-05, + "loss": 0.352, + "loss/crossentropy": 1.6687681376934052, + "loss/hidden": 0.3330078125, + "loss/logits": 0.018973306752741337, + "step": 441 + }, + { + "epoch": 0.442, + "grad_norm": 1.6953125, + "grad_norm_var": 3.941239420572917, + "learning_rate": 2e-05, + "loss": 0.354, + "loss/crossentropy": 1.4019538760185242, + "loss/hidden": 0.33203125, + "loss/logits": 0.021962410770356655, + "step": 442 + }, + { + "epoch": 0.443, + "grad_norm": 2.453125, + "grad_norm_var": 3.8729509989420574, + "learning_rate": 2e-05, + "loss": 0.3591, + "loss/crossentropy": 2.068819046020508, + "loss/hidden": 0.328125, + "loss/logits": 0.03100405167788267, + "step": 443 + }, + { + "epoch": 0.444, + "grad_norm": 2.625, + "grad_norm_var": 3.7484527587890626, + "learning_rate": 2e-05, + "loss": 0.3207, + "loss/crossentropy": 1.2215966582298279, + "loss/hidden": 0.306640625, + "loss/logits": 0.014033652492798865, + "step": 444 + }, + { + "epoch": 0.445, + "grad_norm": 2.796875, + "grad_norm_var": 3.7149943033854167, + "learning_rate": 2e-05, + "loss": 0.2843, + "loss/crossentropy": 0.8393277078866959, + "loss/hidden": 0.2734375, + "loss/logits": 0.010860613780096173, + "step": 445 + }, + { + "epoch": 0.446, + "grad_norm": 3.6875, + "grad_norm_var": 3.7072184244791666, + "learning_rate": 2e-05, + "loss": 0.3369, + "loss/crossentropy": 0.8106656819581985, + "loss/hidden": 0.32421875, + "loss/logits": 0.01267361780628562, + "step": 446 + }, + { + "epoch": 0.447, + "grad_norm": 4.28125, + "grad_norm_var": 3.774466959635417, + "learning_rate": 2e-05, + "loss": 0.3246, + "loss/crossentropy": 1.0552468746900558, + "loss/hidden": 0.3095703125, + "loss/logits": 0.015042064245790243, + "step": 447 + }, + { + "epoch": 0.448, + "grad_norm": 2.734375, + "grad_norm_var": 3.749592081705729, + "learning_rate": 2e-05, + "loss": 0.3734, + "loss/crossentropy": 2.4344149827957153, + "loss/hidden": 0.3427734375, + "loss/logits": 0.030597456730902195, + "step": 448 + }, + { + "epoch": 0.449, + "grad_norm": 3.984375, + "grad_norm_var": 3.4621622721354166, + "learning_rate": 2e-05, + "loss": 0.3036, + "loss/crossentropy": 1.054320715367794, + "loss/hidden": 0.28857421875, + "loss/logits": 0.014980267733335495, + "step": 449 + }, + { + "epoch": 0.45, + "grad_norm": 1.8359375, + "grad_norm_var": 3.5083513895670575, + "learning_rate": 2e-05, + "loss": 0.3366, + "loss/crossentropy": 2.0155181288719177, + "loss/hidden": 0.310546875, + "loss/logits": 0.02600990142673254, + "step": 450 + }, + { + "epoch": 0.451, + "grad_norm": 2.0, + "grad_norm_var": 3.4738199869791666, + "learning_rate": 2e-05, + "loss": 0.3511, + "loss/crossentropy": 1.755088448524475, + "loss/hidden": 0.3271484375, + "loss/logits": 0.023935355246067047, + "step": 451 + }, + { + "epoch": 0.452, + "grad_norm": 2.046875, + "grad_norm_var": 3.4946329752604166, + "learning_rate": 2e-05, + "loss": 0.3499, + "loss/crossentropy": 1.7622599005699158, + "loss/hidden": 0.326171875, + "loss/logits": 0.023745747283101082, + "step": 452 + }, + { + "epoch": 0.453, + "grad_norm": 1.7890625, + "grad_norm_var": 3.4932431538899738, + "learning_rate": 2e-05, + "loss": 0.3215, + "loss/crossentropy": 2.3116530179977417, + "loss/hidden": 0.298828125, + "loss/logits": 0.022703303024172783, + "step": 453 + }, + { + "epoch": 0.454, + "grad_norm": 1.6875, + "grad_norm_var": 3.464989980061849, + "learning_rate": 2e-05, + "loss": 0.3673, + "loss/crossentropy": 1.5556917786598206, + "loss/hidden": 0.3408203125, + "loss/logits": 0.026494111865758896, + "step": 454 + }, + { + "epoch": 0.455, + "grad_norm": 2.0, + "grad_norm_var": 1.3033078511555989, + "learning_rate": 2e-05, + "loss": 0.3715, + "loss/crossentropy": 1.7844219207763672, + "loss/hidden": 0.345703125, + "loss/logits": 0.02580021321773529, + "step": 455 + }, + { + "epoch": 0.456, + "grad_norm": 2.53125, + "grad_norm_var": 1.2765439351399739, + "learning_rate": 2e-05, + "loss": 0.448, + "loss/crossentropy": 1.2347650527954102, + "loss/hidden": 0.4150390625, + "loss/logits": 0.0329879354685545, + "step": 456 + }, + { + "epoch": 0.457, + "grad_norm": 1.4375, + "grad_norm_var": 0.7350563049316406, + "learning_rate": 2e-05, + "loss": 0.3455, + "loss/crossentropy": 1.9715585112571716, + "loss/hidden": 0.318359375, + "loss/logits": 0.02718514297157526, + "step": 457 + }, + { + "epoch": 0.458, + "grad_norm": 1.5859375, + "grad_norm_var": 0.7471616109212239, + "learning_rate": 2e-05, + "loss": 0.3339, + "loss/crossentropy": 2.389525294303894, + "loss/hidden": 0.30859375, + "loss/logits": 0.025292156264185905, + "step": 458 + }, + { + "epoch": 0.459, + "grad_norm": 1.4921875, + "grad_norm_var": 0.8066884358723958, + "learning_rate": 2e-05, + "loss": 0.3166, + "loss/crossentropy": 1.7892733812332153, + "loss/hidden": 0.29296875, + "loss/logits": 0.023592060431838036, + "step": 459 + }, + { + "epoch": 0.46, + "grad_norm": 1.8125, + "grad_norm_var": 0.8243560791015625, + "learning_rate": 2e-05, + "loss": 0.3353, + "loss/crossentropy": 1.9092342853546143, + "loss/hidden": 0.3115234375, + "loss/logits": 0.02376522123813629, + "step": 460 + }, + { + "epoch": 0.461, + "grad_norm": 1.34375, + "grad_norm_var": 0.87099609375, + "learning_rate": 2e-05, + "loss": 0.349, + "loss/crossentropy": 1.9013403058052063, + "loss/hidden": 0.3251953125, + "loss/logits": 0.02381738182157278, + "step": 461 + }, + { + "epoch": 0.462, + "grad_norm": 2.9375, + "grad_norm_var": 0.76396484375, + "learning_rate": 2e-05, + "loss": 0.3492, + "loss/crossentropy": 0.9097070023417473, + "loss/hidden": 0.330078125, + "loss/logits": 0.01913693710230291, + "step": 462 + }, + { + "epoch": 0.463, + "grad_norm": 2.828125, + "grad_norm_var": 0.4963287353515625, + "learning_rate": 2e-05, + "loss": 0.4669, + "loss/crossentropy": 1.9413211345672607, + "loss/hidden": 0.427734375, + "loss/logits": 0.03912976011633873, + "step": 463 + }, + { + "epoch": 0.464, + "grad_norm": 1.9296875, + "grad_norm_var": 0.47173233032226564, + "learning_rate": 2e-05, + "loss": 0.3569, + "loss/crossentropy": 2.3746496438980103, + "loss/hidden": 0.326171875, + "loss/logits": 0.030762989073991776, + "step": 464 + }, + { + "epoch": 0.465, + "grad_norm": 1.796875, + "grad_norm_var": 0.21467259724934895, + "learning_rate": 2e-05, + "loss": 0.3875, + "loss/crossentropy": 1.920172929763794, + "loss/hidden": 0.359375, + "loss/logits": 0.028154666535556316, + "step": 465 + }, + { + "epoch": 0.466, + "grad_norm": 2.59375, + "grad_norm_var": 0.23995768229166667, + "learning_rate": 2e-05, + "loss": 0.4173, + "loss/crossentropy": 2.1804317831993103, + "loss/hidden": 0.3828125, + "loss/logits": 0.03448019549250603, + "step": 466 + }, + { + "epoch": 0.467, + "grad_norm": 2.453125, + "grad_norm_var": 0.25349833170572916, + "learning_rate": 2e-05, + "loss": 0.3635, + "loss/crossentropy": 2.1129865646362305, + "loss/hidden": 0.3369140625, + "loss/logits": 0.026613284833729267, + "step": 467 + }, + { + "epoch": 0.468, + "grad_norm": 3.4375, + "grad_norm_var": 0.37997639973958336, + "learning_rate": 2e-05, + "loss": 0.3892, + "loss/crossentropy": 1.6438812613487244, + "loss/hidden": 0.3623046875, + "loss/logits": 0.026910429820418358, + "step": 468 + }, + { + "epoch": 0.469, + "grad_norm": 13.125, + "grad_norm_var": 7.936161041259766, + "learning_rate": 2e-05, + "loss": 0.4187, + "loss/crossentropy": 1.8062403798103333, + "loss/hidden": 0.3857421875, + "loss/logits": 0.03291827440261841, + "step": 469 + }, + { + "epoch": 0.47, + "grad_norm": 3.421875, + "grad_norm_var": 7.8641212463378904, + "learning_rate": 2e-05, + "loss": 0.4157, + "loss/crossentropy": 1.2208881378173828, + "loss/hidden": 0.39453125, + "loss/logits": 0.02117818035185337, + "step": 470 + }, + { + "epoch": 0.471, + "grad_norm": 1.953125, + "grad_norm_var": 7.8700111389160154, + "learning_rate": 2e-05, + "loss": 0.3306, + "loss/crossentropy": 2.474324107170105, + "loss/hidden": 0.3037109375, + "loss/logits": 0.026909410022199154, + "step": 471 + }, + { + "epoch": 0.472, + "grad_norm": 2.796875, + "grad_norm_var": 7.860741933186849, + "learning_rate": 2e-05, + "loss": 0.4071, + "loss/crossentropy": 1.8907885551452637, + "loss/hidden": 0.3740234375, + "loss/logits": 0.03311134688556194, + "step": 472 + }, + { + "epoch": 0.473, + "grad_norm": 5.40625, + "grad_norm_var": 8.053236643473307, + "learning_rate": 2e-05, + "loss": 0.482, + "loss/crossentropy": 1.851112186908722, + "loss/hidden": 0.4287109375, + "loss/logits": 0.0532735763117671, + "step": 473 + }, + { + "epoch": 0.474, + "grad_norm": 1.8125, + "grad_norm_var": 8.008226521809895, + "learning_rate": 2e-05, + "loss": 0.4011, + "loss/crossentropy": 2.0893144607543945, + "loss/hidden": 0.37109375, + "loss/logits": 0.03000558167695999, + "step": 474 + }, + { + "epoch": 0.475, + "grad_norm": 1.84375, + "grad_norm_var": 7.936071523030599, + "learning_rate": 2e-05, + "loss": 0.4086, + "loss/crossentropy": 1.692557156085968, + "loss/hidden": 0.37890625, + "loss/logits": 0.029658248648047447, + "step": 475 + }, + { + "epoch": 0.476, + "grad_norm": 1.734375, + "grad_norm_var": 7.9510963439941404, + "learning_rate": 2e-05, + "loss": 0.3369, + "loss/crossentropy": 2.7231298685073853, + "loss/hidden": 0.3095703125, + "loss/logits": 0.027365448884665966, + "step": 476 + }, + { + "epoch": 0.477, + "grad_norm": 122.5, + "grad_norm_var": 895.1761065165202, + "learning_rate": 2e-05, + "loss": 1.8739, + "loss/crossentropy": 1.9931391477584839, + "loss/hidden": 1.73828125, + "loss/logits": 0.13565433584153652, + "step": 477 + }, + { + "epoch": 0.478, + "grad_norm": 18.75, + "grad_norm_var": 894.2567990620931, + "learning_rate": 2e-05, + "loss": 0.4467, + "loss/crossentropy": 1.0818050801753998, + "loss/hidden": 0.423828125, + "loss/logits": 0.022886332124471664, + "step": 478 + }, + { + "epoch": 0.479, + "grad_norm": 1.9609375, + "grad_norm_var": 895.3381581624349, + "learning_rate": 2e-05, + "loss": 0.3744, + "loss/crossentropy": 2.382234215736389, + "loss/hidden": 0.3447265625, + "loss/logits": 0.029717115685343742, + "step": 479 + }, + { + "epoch": 0.48, + "grad_norm": 1.71875, + "grad_norm_var": 895.6162839253743, + "learning_rate": 2e-05, + "loss": 0.3323, + "loss/crossentropy": 2.0683305859565735, + "loss/hidden": 0.30859375, + "loss/logits": 0.023680799640715122, + "step": 480 + }, + { + "epoch": 0.481, + "grad_norm": 2.546875, + "grad_norm_var": 894.6604733784993, + "learning_rate": 2e-05, + "loss": 0.3756, + "loss/crossentropy": 2.154377818107605, + "loss/hidden": 0.34765625, + "loss/logits": 0.02795298583805561, + "step": 481 + }, + { + "epoch": 0.482, + "grad_norm": 3.1875, + "grad_norm_var": 893.9573666890462, + "learning_rate": 2e-05, + "loss": 0.4124, + "loss/crossentropy": 1.9701088666915894, + "loss/hidden": 0.3779296875, + "loss/logits": 0.03451960347592831, + "step": 482 + }, + { + "epoch": 0.483, + "grad_norm": 2.5, + "grad_norm_var": 893.8991452534993, + "learning_rate": 2e-05, + "loss": 0.4523, + "loss/crossentropy": 0.9486123919487, + "loss/hidden": 0.4306640625, + "loss/logits": 0.02167674619704485, + "step": 483 + }, + { + "epoch": 0.484, + "grad_norm": 2.578125, + "grad_norm_var": 894.9027565002441, + "learning_rate": 2e-05, + "loss": 0.3955, + "loss/crossentropy": 1.7118502855300903, + "loss/hidden": 0.365234375, + "loss/logits": 0.030311796814203262, + "step": 484 + }, + { + "epoch": 0.485, + "grad_norm": 1.890625, + "grad_norm_var": 900.7159604390462, + "learning_rate": 2e-05, + "loss": 0.3914, + "loss/crossentropy": 1.7511045932769775, + "loss/hidden": 0.36328125, + "loss/logits": 0.02810109406709671, + "step": 485 + }, + { + "epoch": 0.486, + "grad_norm": 2.203125, + "grad_norm_var": 902.0463498433431, + "learning_rate": 2e-05, + "loss": 0.3893, + "loss/crossentropy": 1.9742628931999207, + "loss/hidden": 0.3603515625, + "loss/logits": 0.028935128822922707, + "step": 486 + }, + { + "epoch": 0.487, + "grad_norm": 2.609375, + "grad_norm_var": 901.28504002889, + "learning_rate": 2e-05, + "loss": 0.338, + "loss/crossentropy": 1.5944682955741882, + "loss/hidden": 0.31640625, + "loss/logits": 0.02155130822211504, + "step": 487 + }, + { + "epoch": 0.488, + "grad_norm": 2.0, + "grad_norm_var": 902.1965695699056, + "learning_rate": 2e-05, + "loss": 0.3749, + "loss/crossentropy": 2.109809994697571, + "loss/hidden": 0.3486328125, + "loss/logits": 0.026237317360937595, + "step": 488 + }, + { + "epoch": 0.489, + "grad_norm": 2.828125, + "grad_norm_var": 904.5185605367025, + "learning_rate": 2e-05, + "loss": 0.3601, + "loss/crossentropy": 2.371906280517578, + "loss/hidden": 0.33203125, + "loss/logits": 0.0280781090259552, + "step": 489 + }, + { + "epoch": 0.49, + "grad_norm": 2.25, + "grad_norm_var": 904.0067481994629, + "learning_rate": 2e-05, + "loss": 0.3881, + "loss/crossentropy": 2.3074965476989746, + "loss/hidden": 0.3583984375, + "loss/logits": 0.029700559563934803, + "step": 490 + }, + { + "epoch": 0.491, + "grad_norm": 1.609375, + "grad_norm_var": 904.2906532287598, + "learning_rate": 2e-05, + "loss": 0.3533, + "loss/crossentropy": 2.0604811906814575, + "loss/hidden": 0.3271484375, + "loss/logits": 0.026149596087634563, + "step": 491 + }, + { + "epoch": 0.492, + "grad_norm": 2.203125, + "grad_norm_var": 903.7375221252441, + "learning_rate": 2e-05, + "loss": 0.3982, + "loss/crossentropy": 2.0394086837768555, + "loss/hidden": 0.3671875, + "loss/logits": 0.030979415401816368, + "step": 492 + }, + { + "epoch": 0.493, + "grad_norm": 1.53125, + "grad_norm_var": 17.239774322509767, + "learning_rate": 2e-05, + "loss": 0.3721, + "loss/crossentropy": 1.992867350578308, + "loss/hidden": 0.3447265625, + "loss/logits": 0.02732760366052389, + "step": 493 + }, + { + "epoch": 0.494, + "grad_norm": 1.5, + "grad_norm_var": 0.24021377563476562, + "learning_rate": 2e-05, + "loss": 0.3607, + "loss/crossentropy": 2.0647668838500977, + "loss/hidden": 0.3349609375, + "loss/logits": 0.02573198452591896, + "step": 494 + }, + { + "epoch": 0.495, + "grad_norm": 3.265625, + "grad_norm_var": 0.3059153238932292, + "learning_rate": 2e-05, + "loss": 0.4332, + "loss/crossentropy": 2.0061678886413574, + "loss/hidden": 0.4033203125, + "loss/logits": 0.029847824946045876, + "step": 495 + }, + { + "epoch": 0.496, + "grad_norm": 1.671875, + "grad_norm_var": 0.30953776041666664, + "learning_rate": 2e-05, + "loss": 0.3677, + "loss/crossentropy": 2.029963493347168, + "loss/hidden": 0.3408203125, + "loss/logits": 0.026841914281249046, + "step": 496 + }, + { + "epoch": 0.497, + "grad_norm": 2.1875, + "grad_norm_var": 0.3045074462890625, + "learning_rate": 2e-05, + "loss": 0.3773, + "loss/crossentropy": 1.836094081401825, + "loss/hidden": 0.3505859375, + "loss/logits": 0.026703315787017345, + "step": 497 + }, + { + "epoch": 0.498, + "grad_norm": 1.8984375, + "grad_norm_var": 0.24739761352539064, + "learning_rate": 2e-05, + "loss": 0.3934, + "loss/crossentropy": 2.284022331237793, + "loss/hidden": 0.36328125, + "loss/logits": 0.030102317221462727, + "step": 498 + }, + { + "epoch": 0.499, + "grad_norm": 1.609375, + "grad_norm_var": 0.25783462524414064, + "learning_rate": 2e-05, + "loss": 0.422, + "loss/crossentropy": 1.7640503644943237, + "loss/hidden": 0.388671875, + "loss/logits": 0.03330034948885441, + "step": 499 + }, + { + "epoch": 0.5, + "grad_norm": 2.40625, + "grad_norm_var": 0.2490618387858073, + "learning_rate": 2e-05, + "loss": 0.4409, + "loss/crossentropy": 1.4432637095451355, + "loss/hidden": 0.4130859375, + "loss/logits": 0.027862844988703728, + "step": 500 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.2202930782208e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}