voice_clonning / train.log
narySt's picture
Upload folder using huggingface_hub
5b12713 verified
Epoch 0, Iteration 0, Loss: 7.5258, Loss AR: 6.9801, Loss CFM: 0.5458, Grad Norm: 6.9520, LR: 0.000000
Epoch 0, Iteration 10, Loss: 7.0105, Loss AR: 6.4175, Loss CFM: 0.5930, Grad Norm: 5.4397, LR: 0.000020
Epoch 0, Iteration 20, Loss: 6.7386, Loss AR: 6.1459, Loss CFM: 0.5928, Grad Norm: 5.7247, LR: 0.000020
Epoch 0, Iteration 30, Loss: 6.4623, Loss AR: 5.8612, Loss CFM: 0.6011, Grad Norm: 4.7703, LR: 0.000020
Epoch 0, Iteration 40, Loss: 6.1845, Loss AR: 5.6246, Loss CFM: 0.5598, Grad Norm: 7.5761, LR: 0.000020
Epoch 0, Iteration 50, Loss: 6.0636, Loss AR: 5.5900, Loss CFM: 0.4736, Grad Norm: 3.9460, LR: 0.000020
Epoch 0, Iteration 0, Loss: 7.7018, Loss AR: 6.9496, Loss CFM: 0.7522, Grad Norm: 8.8171, LR: 0.000000
Epoch 0, Iteration 10, Loss: 7.2448, Loss AR: 6.6907, Loss CFM: 0.5542, Grad Norm: 6.4648, LR: 0.000020
Epoch 0, Iteration 20, Loss: 6.7663, Loss AR: 6.1573, Loss CFM: 0.6090, Grad Norm: 8.0314, LR: 0.000020
Epoch 0, Iteration 30, Loss: 6.4644, Loss AR: 5.8628, Loss CFM: 0.6015, Grad Norm: 5.3914, LR: 0.000020
Epoch 0, Iteration 40, Loss: 6.3092, Loss AR: 5.7930, Loss CFM: 0.5162, Grad Norm: 5.8469, LR: 0.000020
Epoch 0, Iteration 50, Loss: 6.2213, Loss AR: 5.6550, Loss CFM: 0.5664, Grad Norm: 4.8429, LR: 0.000020
Epoch 0, Iteration 60, Loss: 6.0766, Loss AR: 5.5246, Loss CFM: 0.5520, Grad Norm: 7.4745, LR: 0.000020
Epoch 0, Iteration 70, Loss: 6.1670, Loss AR: 5.5973, Loss CFM: 0.5697, Grad Norm: 4.2736, LR: 0.000020
Epoch 0, Iteration 80, Loss: 5.9985, Loss AR: 5.3224, Loss CFM: 0.6761, Grad Norm: 4.7134, LR: 0.000020
Epoch 0, Iteration 90, Loss: 5.9445, Loss AR: 5.3802, Loss CFM: 0.5642, Grad Norm: 4.5889, LR: 0.000020
Epoch 0, Iteration 100, Loss: 5.8275, Loss AR: 5.2952, Loss CFM: 0.5323, Grad Norm: 4.2345, LR: 0.000020
Epoch 0, Iteration 110, Loss: 5.8989, Loss AR: 5.3299, Loss CFM: 0.5690, Grad Norm: 3.8339, LR: 0.000020
Epoch 0, Iteration 120, Loss: 5.9059, Loss AR: 5.3507, Loss CFM: 0.5552, Grad Norm: 4.4173, LR: 0.000020
Epoch 0, Iteration 130, Loss: 5.8601, Loss AR: 5.2886, Loss CFM: 0.5715, Grad Norm: 4.7186, LR: 0.000020
Epoch 0, Iteration 140, Loss: 5.7410, Loss AR: 5.1703, Loss CFM: 0.5707, Grad Norm: 5.0721, LR: 0.000020
Epoch 0, Iteration 150, Loss: 5.6626, Loss AR: 5.1165, Loss CFM: 0.5461, Grad Norm: 5.6720, LR: 0.000020
Epoch 0, Iteration 160, Loss: 5.6413, Loss AR: 5.1209, Loss CFM: 0.5204, Grad Norm: 5.3758, LR: 0.000020
Epoch 0, Iteration 170, Loss: 5.7048, Loss AR: 5.1517, Loss CFM: 0.5532, Grad Norm: 5.3928, LR: 0.000020
Epoch 0, Iteration 180, Loss: 5.5587, Loss AR: 5.0172, Loss CFM: 0.5415, Grad Norm: 5.4065, LR: 0.000020
Epoch 0, Iteration 190, Loss: 5.4207, Loss AR: 4.9393, Loss CFM: 0.4814, Grad Norm: 4.1240, LR: 0.000020
Epoch 0, Iteration 200, Loss: 5.6664, Loss AR: 5.0300, Loss CFM: 0.6364, Grad Norm: 6.8378, LR: 0.000020
Epoch 0, Iteration 210, Loss: 5.5199, Loss AR: 4.9741, Loss CFM: 0.5458, Grad Norm: 4.4962, LR: 0.000020
Epoch 0, Iteration 220, Loss: 5.7349, Loss AR: 5.1510, Loss CFM: 0.5839, Grad Norm: 4.6485, LR: 0.000020
Epoch 0, Iteration 230, Loss: 5.6674, Loss AR: 5.0937, Loss CFM: 0.5737, Grad Norm: 5.9289, LR: 0.000020
Epoch 0, Iteration 240, Loss: 5.5051, Loss AR: 5.0090, Loss CFM: 0.4960, Grad Norm: 3.6055, LR: 0.000020
Epoch 0, Iteration 250, Loss: 5.5708, Loss AR: 5.0226, Loss CFM: 0.5482, Grad Norm: 6.7605, LR: 0.000020
Epoch 0, Iteration 260, Loss: 6.3035, Loss AR: 5.7051, Loss CFM: 0.5985, Grad Norm: 6.0141, LR: 0.000020
Epoch 0, Iteration 270, Loss: 5.6583, Loss AR: 5.0007, Loss CFM: 0.6576, Grad Norm: 8.8000, LR: 0.000020
Epoch 0, Iteration 280, Loss: 5.5341, Loss AR: 5.0134, Loss CFM: 0.5208, Grad Norm: 4.0578, LR: 0.000020
Epoch 0, Iteration 290, Loss: 5.5949, Loss AR: 5.0421, Loss CFM: 0.5528, Grad Norm: 3.9655, LR: 0.000020
Epoch 0, Iteration 300, Loss: 5.5752, Loss AR: 5.0101, Loss CFM: 0.5651, Grad Norm: 3.2822, LR: 0.000020
Epoch 0, Iteration 310, Loss: 5.5713, Loss AR: 5.0474, Loss CFM: 0.5239, Grad Norm: 4.2575, LR: 0.000020
Epoch 0, Iteration 320, Loss: 5.3239, Loss AR: 4.8022, Loss CFM: 0.5217, Grad Norm: 4.8820, LR: 0.000020
Epoch 0, Iteration 330, Loss: 5.4435, Loss AR: 4.8896, Loss CFM: 0.5539, Grad Norm: 4.7381, LR: 0.000020
Epoch 0, Iteration 340, Loss: 5.6271, Loss AR: 5.0505, Loss CFM: 0.5766, Grad Norm: 4.3497, LR: 0.000020
Epoch 0, Iteration 350, Loss: 5.3990, Loss AR: 4.9110, Loss CFM: 0.4879, Grad Norm: 3.8114, LR: 0.000020
Epoch 0, Iteration 360, Loss: 5.2877, Loss AR: 4.7582, Loss CFM: 0.5294, Grad Norm: 5.0480, LR: 0.000020
Epoch 0, Iteration 370, Loss: 5.4688, Loss AR: 4.8899, Loss CFM: 0.5790, Grad Norm: 6.0723, LR: 0.000020
Epoch 0, Iteration 380, Loss: 5.3528, Loss AR: 4.8001, Loss CFM: 0.5527, Grad Norm: 4.8139, LR: 0.000020
Epoch 0, Iteration 390, Loss: 5.3339, Loss AR: 4.7649, Loss CFM: 0.5689, Grad Norm: 6.2641, LR: 0.000020
Epoch 0, Iteration 400, Loss: 5.5718, Loss AR: 5.0253, Loss CFM: 0.5465, Grad Norm: 4.4433, LR: 0.000020
Epoch 0, Iteration 410, Loss: 5.3135, Loss AR: 4.7553, Loss CFM: 0.5582, Grad Norm: 4.1293, LR: 0.000020
Epoch 0, Iteration 420, Loss: 5.2317, Loss AR: 4.7091, Loss CFM: 0.5226, Grad Norm: 3.7546, LR: 0.000020
Epoch 0, Iteration 430, Loss: 5.4425, Loss AR: 4.9118, Loss CFM: 0.5307, Grad Norm: 5.1922, LR: 0.000020
Epoch 0, Iteration 440, Loss: 5.1922, Loss AR: 4.6560, Loss CFM: 0.5362, Grad Norm: 4.5986, LR: 0.000020
Epoch 0, Iteration 450, Loss: 5.5871, Loss AR: 5.0075, Loss CFM: 0.5797, Grad Norm: 3.8599, LR: 0.000020
Epoch 0, Iteration 460, Loss: 5.2881, Loss AR: 4.7467, Loss CFM: 0.5414, Grad Norm: 5.2546, LR: 0.000020
Epoch 0, Iteration 470, Loss: 5.5572, Loss AR: 5.0190, Loss CFM: 0.5382, Grad Norm: 4.0549, LR: 0.000020
Epoch 0, Iteration 480, Loss: 5.5413, Loss AR: 4.9518, Loss CFM: 0.5894, Grad Norm: 4.7696, LR: 0.000020
Epoch 0, Iteration 490, Loss: 5.4549, Loss AR: 4.8403, Loss CFM: 0.6147, Grad Norm: 4.5709, LR: 0.000020
Epoch 0, Iteration 500, Loss: 5.5275, Loss AR: 4.9793, Loss CFM: 0.5482, Grad Norm: 6.3125, LR: 0.000020
Epoch 0, Iteration 510, Loss: 5.5529, Loss AR: 5.0058, Loss CFM: 0.5471, Grad Norm: 4.9021, LR: 0.000020
Epoch 0, Iteration 520, Loss: 5.2787, Loss AR: 4.7131, Loss CFM: 0.5656, Grad Norm: 4.3653, LR: 0.000020
Epoch 0, Iteration 530, Loss: 5.3460, Loss AR: 4.8713, Loss CFM: 0.4747, Grad Norm: 5.1378, LR: 0.000020
Epoch 0, Iteration 540, Loss: 5.5362, Loss AR: 4.9833, Loss CFM: 0.5529, Grad Norm: 4.7004, LR: 0.000020
Epoch 0, Iteration 550, Loss: 5.4108, Loss AR: 4.9068, Loss CFM: 0.5040, Grad Norm: 3.7669, LR: 0.000020
Epoch 0, Iteration 560, Loss: 5.5713, Loss AR: 4.9764, Loss CFM: 0.5949, Grad Norm: 5.0521, LR: 0.000020
Epoch 0, Iteration 570, Loss: 5.5925, Loss AR: 5.0070, Loss CFM: 0.5855, Grad Norm: 3.6271, LR: 0.000020
Epoch 0, Iteration 580, Loss: 5.4151, Loss AR: 4.8336, Loss CFM: 0.5814, Grad Norm: 3.7519, LR: 0.000020
Epoch 0, Iteration 590, Loss: 5.3009, Loss AR: 4.7907, Loss CFM: 0.5103, Grad Norm: 3.6174, LR: 0.000020
Epoch 0, Iteration 600, Loss: 5.4019, Loss AR: 4.8470, Loss CFM: 0.5548, Grad Norm: 3.8691, LR: 0.000020
Epoch 0, Iteration 610, Loss: 5.2526, Loss AR: 4.6560, Loss CFM: 0.5966, Grad Norm: 4.1214, LR: 0.000020
Epoch 0, Iteration 620, Loss: 5.4412, Loss AR: 4.8634, Loss CFM: 0.5777, Grad Norm: 6.7881, LR: 0.000020
Epoch 0, Iteration 630, Loss: 5.3791, Loss AR: 4.8449, Loss CFM: 0.5342, Grad Norm: 4.9049, LR: 0.000020
Epoch 0, Iteration 640, Loss: 5.4348, Loss AR: 4.8825, Loss CFM: 0.5523, Grad Norm: 4.5360, LR: 0.000020
Epoch 0, Iteration 650, Loss: 5.1263, Loss AR: 4.6611, Loss CFM: 0.4652, Grad Norm: 4.1793, LR: 0.000020
Epoch 0, Iteration 660, Loss: 5.4653, Loss AR: 4.8722, Loss CFM: 0.5931, Grad Norm: 5.3241, LR: 0.000020
Epoch 0, Iteration 670, Loss: 5.4785, Loss AR: 4.9457, Loss CFM: 0.5328, Grad Norm: 4.0256, LR: 0.000020
Epoch 0, Iteration 680, Loss: 5.5099, Loss AR: 4.8937, Loss CFM: 0.6162, Grad Norm: 6.9644, LR: 0.000020
Epoch 0, Iteration 690, Loss: 5.3313, Loss AR: 4.7925, Loss CFM: 0.5388, Grad Norm: 3.8656, LR: 0.000020
Epoch 0, Iteration 700, Loss: 5.2152, Loss AR: 4.6903, Loss CFM: 0.5249, Grad Norm: 5.5392, LR: 0.000020
Epoch 0, Iteration 710, Loss: 5.3868, Loss AR: 4.8118, Loss CFM: 0.5751, Grad Norm: 5.8774, LR: 0.000020
Epoch 0, Iteration 720, Loss: 5.2272, Loss AR: 4.6615, Loss CFM: 0.5657, Grad Norm: 4.0503, LR: 0.000020
Epoch 0, Iteration 730, Loss: 5.5152, Loss AR: 5.0051, Loss CFM: 0.5101, Grad Norm: 3.9390, LR: 0.000020
Epoch 0, Iteration 740, Loss: 5.4329, Loss AR: 4.8548, Loss CFM: 0.5781, Grad Norm: 5.8770, LR: 0.000020
Epoch 0, Iteration 750, Loss: 5.4860, Loss AR: 4.9289, Loss CFM: 0.5572, Grad Norm: 5.6004, LR: 0.000020
Epoch 0, Iteration 760, Loss: 5.1881, Loss AR: 4.6340, Loss CFM: 0.5541, Grad Norm: 5.2968, LR: 0.000020
Epoch 0, Iteration 770, Loss: 5.4245, Loss AR: 4.8049, Loss CFM: 0.6196, Grad Norm: 5.1651, LR: 0.000020
Epoch 0, Iteration 780, Loss: 5.3626, Loss AR: 4.7671, Loss CFM: 0.5956, Grad Norm: 10.4747, LR: 0.000020
Epoch 0, Iteration 790, Loss: 5.3641, Loss AR: 4.8039, Loss CFM: 0.5603, Grad Norm: 4.1450, LR: 0.000020
Epoch 0, Iteration 800, Loss: 5.3768, Loss AR: 4.7567, Loss CFM: 0.6202, Grad Norm: 4.3161, LR: 0.000020
Epoch 0, Iteration 810, Loss: 5.4846, Loss AR: 4.9275, Loss CFM: 0.5571, Grad Norm: 5.5100, LR: 0.000020
Epoch 0, Iteration 820, Loss: 5.4436, Loss AR: 4.9526, Loss CFM: 0.4910, Grad Norm: 4.3906, LR: 0.000020
Epoch 0, Iteration 830, Loss: 5.5320, Loss AR: 4.9500, Loss CFM: 0.5819, Grad Norm: 6.1807, LR: 0.000020
Epoch 0, Iteration 840, Loss: 5.4949, Loss AR: 4.9614, Loss CFM: 0.5335, Grad Norm: 4.1163, LR: 0.000020
Epoch 0, Iteration 850, Loss: 5.4410, Loss AR: 4.8215, Loss CFM: 0.6195, Grad Norm: 4.6125, LR: 0.000020
Epoch 0, Iteration 860, Loss: 5.3765, Loss AR: 4.8433, Loss CFM: 0.5332, Grad Norm: 4.1727, LR: 0.000020
Epoch 0, Iteration 870, Loss: 5.3839, Loss AR: 4.7210, Loss CFM: 0.6629, Grad Norm: 4.5247, LR: 0.000020
Epoch 0, Iteration 880, Loss: 5.6393, Loss AR: 4.9504, Loss CFM: 0.6890, Grad Norm: 4.7140, LR: 0.000020
Epoch 0, Iteration 890, Loss: 5.1932, Loss AR: 4.6423, Loss CFM: 0.5509, Grad Norm: 4.1024, LR: 0.000020
Epoch 0, Iteration 900, Loss: 5.0109, Loss AR: 4.5241, Loss CFM: 0.4867, Grad Norm: 4.1147, LR: 0.000020
Epoch 0, Iteration 910, Loss: 5.2314, Loss AR: 4.6638, Loss CFM: 0.5676, Grad Norm: 5.3412, LR: 0.000020
Epoch 0, Iteration 920, Loss: 5.3463, Loss AR: 4.7689, Loss CFM: 0.5775, Grad Norm: 5.0088, LR: 0.000020
Epoch 0, Iteration 930, Loss: 5.5358, Loss AR: 4.9589, Loss CFM: 0.5770, Grad Norm: 5.6565, LR: 0.000020
Epoch 0, Iteration 940, Loss: 5.4264, Loss AR: 4.8325, Loss CFM: 0.5939, Grad Norm: 4.4748, LR: 0.000020
Epoch 0, Iteration 950, Loss: 5.2731, Loss AR: 4.7376, Loss CFM: 0.5355, Grad Norm: 4.0443, LR: 0.000020
Epoch 0, Iteration 960, Loss: 5.3462, Loss AR: 4.7533, Loss CFM: 0.5929, Grad Norm: 5.9732, LR: 0.000020
Epoch 0, Iteration 970, Loss: 5.2805, Loss AR: 4.7254, Loss CFM: 0.5551, Grad Norm: 3.9393, LR: 0.000020
Epoch 0, Iteration 980, Loss: 5.4612, Loss AR: 4.9317, Loss CFM: 0.5295, Grad Norm: 3.9520, LR: 0.000020
Epoch 0, Iteration 990, Loss: 5.3353, Loss AR: 4.7674, Loss CFM: 0.5678, Grad Norm: 4.4031, LR: 0.000020
Epoch 0, Iteration 1000, Loss: 5.2499, Loss AR: 4.7318, Loss CFM: 0.5181, Grad Norm: 4.7017, LR: 0.000020
Epoch 0, Iteration 1010, Loss: 5.3774, Loss AR: 4.7978, Loss CFM: 0.5796, Grad Norm: 4.2218, LR: 0.000020
Epoch 0, Iteration 1020, Loss: 5.3071, Loss AR: 4.8034, Loss CFM: 0.5037, Grad Norm: 3.8264, LR: 0.000020
Epoch 0, Iteration 1030, Loss: 5.2578, Loss AR: 4.7172, Loss CFM: 0.5406, Grad Norm: 6.4791, LR: 0.000020
Epoch 0, Iteration 1040, Loss: 5.6114, Loss AR: 5.0383, Loss CFM: 0.5731, Grad Norm: 6.3689, LR: 0.000020
Epoch 0, Iteration 1050, Loss: 5.2689, Loss AR: 4.7411, Loss CFM: 0.5278, Grad Norm: 5.2403, LR: 0.000020
Epoch 0, Iteration 1060, Loss: 5.2950, Loss AR: 4.7451, Loss CFM: 0.5500, Grad Norm: 3.4220, LR: 0.000020
Epoch 0, Iteration 1070, Loss: 5.3083, Loss AR: 4.7900, Loss CFM: 0.5183, Grad Norm: 4.5241, LR: 0.000020
Epoch 0, Iteration 1080, Loss: 5.4661, Loss AR: 4.9377, Loss CFM: 0.5285, Grad Norm: 7.2139, LR: 0.000020
Epoch 0, Iteration 1090, Loss: 5.2544, Loss AR: 4.7978, Loss CFM: 0.4566, Grad Norm: 5.6067, LR: 0.000020
Epoch 0, Iteration 1100, Loss: 5.3335, Loss AR: 4.7811, Loss CFM: 0.5524, Grad Norm: 6.8690, LR: 0.000020
Epoch 0, Iteration 1110, Loss: 5.3637, Loss AR: 4.7128, Loss CFM: 0.6510, Grad Norm: 6.0104, LR: 0.000020
Epoch 0, Iteration 1120, Loss: 5.4086, Loss AR: 4.7741, Loss CFM: 0.6345, Grad Norm: 8.8904, LR: 0.000020
Epoch 0, Iteration 1130, Loss: 5.4019, Loss AR: 4.8292, Loss CFM: 0.5727, Grad Norm: 5.0530, LR: 0.000020
Epoch 0, Iteration 1140, Loss: 5.1847, Loss AR: 4.6128, Loss CFM: 0.5719, Grad Norm: 4.7495, LR: 0.000020
Epoch 0, Iteration 1150, Loss: 5.5795, Loss AR: 5.0154, Loss CFM: 0.5642, Grad Norm: 4.0755, LR: 0.000020
Epoch 0, Iteration 1160, Loss: 5.3507, Loss AR: 4.8025, Loss CFM: 0.5483, Grad Norm: 3.6153, LR: 0.000020
Epoch 0, Iteration 1170, Loss: 5.2381, Loss AR: 4.7286, Loss CFM: 0.5095, Grad Norm: 5.6146, LR: 0.000020
Epoch 0, Iteration 1180, Loss: 5.5085, Loss AR: 4.9839, Loss CFM: 0.5246, Grad Norm: 5.0095, LR: 0.000020
Epoch 0, Iteration 1190, Loss: 5.1023, Loss AR: 4.5498, Loss CFM: 0.5524, Grad Norm: 4.1211, LR: 0.000020
Epoch 0, Iteration 1200, Loss: 5.2774, Loss AR: 4.6608, Loss CFM: 0.6166, Grad Norm: 4.5691, LR: 0.000020
Epoch 0, Iteration 1210, Loss: 5.3057, Loss AR: 4.7351, Loss CFM: 0.5706, Grad Norm: 4.2958, LR: 0.000020
Epoch 0, Iteration 1220, Loss: 5.2706, Loss AR: 4.6980, Loss CFM: 0.5726, Grad Norm: 3.8095, LR: 0.000020
Epoch 0, Iteration 1230, Loss: 5.4759, Loss AR: 4.7975, Loss CFM: 0.6784, Grad Norm: 5.7690, LR: 0.000020
Epoch 0, Iteration 1240, Loss: 5.3110, Loss AR: 4.7835, Loss CFM: 0.5275, Grad Norm: 4.3540, LR: 0.000020
Epoch 0, Iteration 1250, Loss: 5.0515, Loss AR: 4.5392, Loss CFM: 0.5123, Grad Norm: 4.1630, LR: 0.000020
Epoch 0, Iteration 1260, Loss: 5.2643, Loss AR: 4.7089, Loss CFM: 0.5554, Grad Norm: 6.8152, LR: 0.000020
Epoch 0, Iteration 1270, Loss: 5.3783, Loss AR: 4.8372, Loss CFM: 0.5411, Grad Norm: 3.6510, LR: 0.000020
Epoch 0, Iteration 1280, Loss: 5.5929, Loss AR: 4.9992, Loss CFM: 0.5937, Grad Norm: 4.3651, LR: 0.000020
Epoch 0, Iteration 1290, Loss: 5.2103, Loss AR: 4.6615, Loss CFM: 0.5488, Grad Norm: 3.9239, LR: 0.000020
Epoch 0, Iteration 1300, Loss: 5.2145, Loss AR: 4.6930, Loss CFM: 0.5215, Grad Norm: 3.8160, LR: 0.000020
Epoch 0, Iteration 1310, Loss: 5.3047, Loss AR: 4.7133, Loss CFM: 0.5914, Grad Norm: 3.5853, LR: 0.000020
Epoch 0, Iteration 1320, Loss: 5.1850, Loss AR: 4.6318, Loss CFM: 0.5531, Grad Norm: 3.8217, LR: 0.000020
Epoch 0, Iteration 1330, Loss: 5.4648, Loss AR: 4.9031, Loss CFM: 0.5616, Grad Norm: 6.4728, LR: 0.000020
Epoch 0, Iteration 1340, Loss: 5.0275, Loss AR: 4.5142, Loss CFM: 0.5133, Grad Norm: 3.3592, LR: 0.000020
Epoch 0, Iteration 1350, Loss: 5.0808, Loss AR: 4.5014, Loss CFM: 0.5794, Grad Norm: 4.6000, LR: 0.000020
Epoch 0, Iteration 1360, Loss: 5.2993, Loss AR: 4.7808, Loss CFM: 0.5185, Grad Norm: 3.7394, LR: 0.000020
Epoch 0, Iteration 1370, Loss: 5.4737, Loss AR: 4.9098, Loss CFM: 0.5640, Grad Norm: 4.0343, LR: 0.000020
Epoch 0, Iteration 1380, Loss: 5.5193, Loss AR: 4.9305, Loss CFM: 0.5888, Grad Norm: 4.3538, LR: 0.000020
Epoch 0, Iteration 1390, Loss: 5.5421, Loss AR: 5.0038, Loss CFM: 0.5383, Grad Norm: 5.8908, LR: 0.000020
Epoch 0, Iteration 1400, Loss: 5.2182, Loss AR: 4.6797, Loss CFM: 0.5385, Grad Norm: 4.4747, LR: 0.000020
Epoch 0, Iteration 1410, Loss: 5.3682, Loss AR: 4.8831, Loss CFM: 0.4851, Grad Norm: 5.1043, LR: 0.000020
Epoch 0, Iteration 1420, Loss: 5.2301, Loss AR: 4.6936, Loss CFM: 0.5365, Grad Norm: 4.8725, LR: 0.000020
Epoch 0, Iteration 1430, Loss: 5.3199, Loss AR: 4.8039, Loss CFM: 0.5160, Grad Norm: 3.4033, LR: 0.000020
Epoch 0, Iteration 1440, Loss: 5.4180, Loss AR: 4.8291, Loss CFM: 0.5889, Grad Norm: 4.3706, LR: 0.000020
Epoch 0, Iteration 1450, Loss: 5.3771, Loss AR: 4.8229, Loss CFM: 0.5542, Grad Norm: 4.1864, LR: 0.000020
Epoch 0, Iteration 1460, Loss: 5.3217, Loss AR: 4.7490, Loss CFM: 0.5728, Grad Norm: 4.2266, LR: 0.000020
Epoch 0, Iteration 1470, Loss: 5.3758, Loss AR: 4.8890, Loss CFM: 0.4868, Grad Norm: 5.7534, LR: 0.000020
Epoch 0, Iteration 1480, Loss: 5.1558, Loss AR: 4.6062, Loss CFM: 0.5496, Grad Norm: 5.2170, LR: 0.000020
Epoch 0, Iteration 1490, Loss: 5.3539, Loss AR: 4.8451, Loss CFM: 0.5088, Grad Norm: 6.2640, LR: 0.000020
Epoch 0, Iteration 1500, Loss: 5.5970, Loss AR: 4.7608, Loss CFM: 0.8362, Grad Norm: 4.1122, LR: 0.000020
Epoch 0, Iteration 1510, Loss: 5.1470, Loss AR: 4.6166, Loss CFM: 0.5303, Grad Norm: 3.6266, LR: 0.000020
Epoch 0, Iteration 1520, Loss: 5.3144, Loss AR: 4.8071, Loss CFM: 0.5073, Grad Norm: 5.4103, LR: 0.000020
Epoch 0, Iteration 1530, Loss: 5.2354, Loss AR: 4.6823, Loss CFM: 0.5531, Grad Norm: 3.8594, LR: 0.000020
Epoch 0, Iteration 1540, Loss: 5.4144, Loss AR: 4.8778, Loss CFM: 0.5366, Grad Norm: 4.0471, LR: 0.000020
Epoch 0, Iteration 1550, Loss: 5.1276, Loss AR: 4.5927, Loss CFM: 0.5349, Grad Norm: 4.8183, LR: 0.000020
Epoch 0, Iteration 1560, Loss: 5.3126, Loss AR: 4.7497, Loss CFM: 0.5629, Grad Norm: 4.2364, LR: 0.000020
Epoch 0, Iteration 1570, Loss: 5.2820, Loss AR: 4.7321, Loss CFM: 0.5498, Grad Norm: 4.0100, LR: 0.000020
Epoch 0, Iteration 1580, Loss: 5.1118, Loss AR: 4.5671, Loss CFM: 0.5446, Grad Norm: 3.1754, LR: 0.000020
Epoch 0, Iteration 1590, Loss: 5.4985, Loss AR: 4.8902, Loss CFM: 0.6083, Grad Norm: 5.4517, LR: 0.000020
Epoch 0, Iteration 1600, Loss: 5.2283, Loss AR: 4.6674, Loss CFM: 0.5609, Grad Norm: 3.2734, LR: 0.000020
Epoch 0, Iteration 1610, Loss: 5.3412, Loss AR: 4.7790, Loss CFM: 0.5622, Grad Norm: 4.8352, LR: 0.000020
Epoch 0, Iteration 1620, Loss: 5.1715, Loss AR: 4.6541, Loss CFM: 0.5175, Grad Norm: 4.2765, LR: 0.000020
Epoch 0, Iteration 1630, Loss: 5.3036, Loss AR: 4.7765, Loss CFM: 0.5270, Grad Norm: 5.2772, LR: 0.000020
Epoch 0, Iteration 1640, Loss: 5.3929, Loss AR: 4.8732, Loss CFM: 0.5198, Grad Norm: 4.2313, LR: 0.000020
Epoch 0, Iteration 1650, Loss: 5.3026, Loss AR: 4.7885, Loss CFM: 0.5141, Grad Norm: 3.5110, LR: 0.000020
Epoch 0, Iteration 1660, Loss: 5.8230, Loss AR: 5.2471, Loss CFM: 0.5760, Grad Norm: 9.3764, LR: 0.000020
Epoch 0, Iteration 1670, Loss: 5.3783, Loss AR: 4.8770, Loss CFM: 0.5012, Grad Norm: 4.6288, LR: 0.000020
Epoch 0, Iteration 1680, Loss: 5.2248, Loss AR: 4.6208, Loss CFM: 0.6040, Grad Norm: 3.5854, LR: 0.000020
Epoch 0, Iteration 1690, Loss: 5.3797, Loss AR: 4.8166, Loss CFM: 0.5631, Grad Norm: 5.4671, LR: 0.000020
Epoch 0, Iteration 1700, Loss: 5.4456, Loss AR: 4.8693, Loss CFM: 0.5764, Grad Norm: 5.1622, LR: 0.000020
Epoch 0, Iteration 1710, Loss: 5.3349, Loss AR: 4.8032, Loss CFM: 0.5317, Grad Norm: 3.6839, LR: 0.000020
Epoch 0, Iteration 1720, Loss: 5.1886, Loss AR: 4.6520, Loss CFM: 0.5366, Grad Norm: 5.7352, LR: 0.000020
Epoch 0, Iteration 1730, Loss: 5.2384, Loss AR: 4.6707, Loss CFM: 0.5677, Grad Norm: 3.9772, LR: 0.000020
Epoch 0, Iteration 1740, Loss: 5.3079, Loss AR: 4.7419, Loss CFM: 0.5660, Grad Norm: 4.1456, LR: 0.000020
Epoch 0, Iteration 1750, Loss: 5.2789, Loss AR: 4.6718, Loss CFM: 0.6071, Grad Norm: 3.9497, LR: 0.000020
Epoch 0, Iteration 1760, Loss: 5.2347, Loss AR: 4.7246, Loss CFM: 0.5101, Grad Norm: 4.3098, LR: 0.000020
Epoch 0, Iteration 1770, Loss: 5.1397, Loss AR: 4.5671, Loss CFM: 0.5726, Grad Norm: 4.4709, LR: 0.000020
Epoch 0, Iteration 1780, Loss: 5.2627, Loss AR: 4.7605, Loss CFM: 0.5022, Grad Norm: 4.5379, LR: 0.000020
Epoch 0, Iteration 1790, Loss: 5.2060, Loss AR: 4.6472, Loss CFM: 0.5588, Grad Norm: 4.8890, LR: 0.000020
Epoch 0, Iteration 1800, Loss: 5.2562, Loss AR: 4.6848, Loss CFM: 0.5713, Grad Norm: 4.8399, LR: 0.000020
Epoch 0, Iteration 1810, Loss: 5.1739, Loss AR: 4.6551, Loss CFM: 0.5188, Grad Norm: 8.0265, LR: 0.000020
Epoch 0, Iteration 1820, Loss: 5.3638, Loss AR: 4.8588, Loss CFM: 0.5050, Grad Norm: 4.2128, LR: 0.000020
Epoch 0, Iteration 1830, Loss: 5.2167, Loss AR: 4.6291, Loss CFM: 0.5876, Grad Norm: 3.3567, LR: 0.000020
Epoch 0, Iteration 1840, Loss: 5.4313, Loss AR: 4.8659, Loss CFM: 0.5653, Grad Norm: 3.8327, LR: 0.000020
Epoch 0, Iteration 1850, Loss: 5.0603, Loss AR: 4.5287, Loss CFM: 0.5316, Grad Norm: 5.0694, LR: 0.000020
Epoch 0, Iteration 1860, Loss: 5.3148, Loss AR: 4.7963, Loss CFM: 0.5185, Grad Norm: 3.7572, LR: 0.000020
Epoch 0, Iteration 1870, Loss: 5.3821, Loss AR: 4.8277, Loss CFM: 0.5544, Grad Norm: 4.9261, LR: 0.000020
Epoch 0, Iteration 1880, Loss: 5.2035, Loss AR: 4.6734, Loss CFM: 0.5301, Grad Norm: 4.5891, LR: 0.000020
Epoch 0, Iteration 1890, Loss: 5.1438, Loss AR: 4.6195, Loss CFM: 0.5242, Grad Norm: 4.6562, LR: 0.000020
Epoch 0, Iteration 1900, Loss: 5.5428, Loss AR: 4.9602, Loss CFM: 0.5826, Grad Norm: 5.5299, LR: 0.000020
Epoch 0, Iteration 1910, Loss: 5.3585, Loss AR: 4.8487, Loss CFM: 0.5099, Grad Norm: 4.9426, LR: 0.000020
Epoch 0, Iteration 1920, Loss: 5.3880, Loss AR: 4.8353, Loss CFM: 0.5527, Grad Norm: 4.5711, LR: 0.000020
Epoch 0, Iteration 1930, Loss: 5.2034, Loss AR: 4.6078, Loss CFM: 0.5956, Grad Norm: 4.3320, LR: 0.000020
Epoch 0, Iteration 1940, Loss: 5.5280, Loss AR: 4.8997, Loss CFM: 0.6283, Grad Norm: 4.4144, LR: 0.000020
Epoch 0, Iteration 1950, Loss: 5.2620, Loss AR: 4.7571, Loss CFM: 0.5050, Grad Norm: 5.5530, LR: 0.000020
Epoch 0, Iteration 1960, Loss: 5.1519, Loss AR: 4.6106, Loss CFM: 0.5413, Grad Norm: 5.6473, LR: 0.000020
Epoch 0, Iteration 1970, Loss: 5.4207, Loss AR: 4.7767, Loss CFM: 0.6441, Grad Norm: 3.7657, LR: 0.000020
Epoch 0, Iteration 1980, Loss: 5.1864, Loss AR: 4.6094, Loss CFM: 0.5771, Grad Norm: 4.9301, LR: 0.000020
Epoch 0, Iteration 1990, Loss: 5.4485, Loss AR: 4.8673, Loss CFM: 0.5812, Grad Norm: 4.6316, LR: 0.000020
Epoch 0, Iteration 2000, Loss: 5.0527, Loss AR: 4.4801, Loss CFM: 0.5726, Grad Norm: 4.6769, LR: 0.000020
Epoch 0, Iteration 2010, Loss: 5.1165, Loss AR: 4.5794, Loss CFM: 0.5371, Grad Norm: 5.7778, LR: 0.000020
Epoch 0, Iteration 2020, Loss: 5.1171, Loss AR: 4.5741, Loss CFM: 0.5430, Grad Norm: 3.7777, LR: 0.000020
Epoch 0, Iteration 2030, Loss: 5.2927, Loss AR: 4.7100, Loss CFM: 0.5827, Grad Norm: 4.7898, LR: 0.000020
Epoch 0, Iteration 2040, Loss: 5.2259, Loss AR: 4.6140, Loss CFM: 0.6120, Grad Norm: 7.1434, LR: 0.000020
Epoch 0, Iteration 2050, Loss: 5.1323, Loss AR: 4.6053, Loss CFM: 0.5270, Grad Norm: 4.0921, LR: 0.000020
Epoch 0, Iteration 2060, Loss: 5.1297, Loss AR: 4.5767, Loss CFM: 0.5530, Grad Norm: 4.2761, LR: 0.000020
Epoch 0, Iteration 2070, Loss: 5.2265, Loss AR: 4.6879, Loss CFM: 0.5386, Grad Norm: 5.0672, LR: 0.000020
Epoch 0, Iteration 2080, Loss: 5.4108, Loss AR: 4.8414, Loss CFM: 0.5694, Grad Norm: 4.0253, LR: 0.000020
Epoch 0, Iteration 2090, Loss: 5.0230, Loss AR: 4.4914, Loss CFM: 0.5316, Grad Norm: 4.6322, LR: 0.000020
Epoch 0, Iteration 2100, Loss: 5.3331, Loss AR: 4.7363, Loss CFM: 0.5968, Grad Norm: 6.1474, LR: 0.000020
Epoch 0, Iteration 2110, Loss: 5.0261, Loss AR: 4.5273, Loss CFM: 0.4987, Grad Norm: 3.5932, LR: 0.000020
Epoch 0, Iteration 2120, Loss: 5.1672, Loss AR: 4.6369, Loss CFM: 0.5303, Grad Norm: 5.6880, LR: 0.000020
Epoch 0, Iteration 2130, Loss: 5.3663, Loss AR: 4.8034, Loss CFM: 0.5630, Grad Norm: 5.2601, LR: 0.000020
Epoch 0, Iteration 2140, Loss: 5.1757, Loss AR: 4.6648, Loss CFM: 0.5109, Grad Norm: 3.6318, LR: 0.000020
Epoch 0, Iteration 2150, Loss: 5.1751, Loss AR: 4.6051, Loss CFM: 0.5700, Grad Norm: 3.5133, LR: 0.000020
Epoch 0, Iteration 2160, Loss: 5.1119, Loss AR: 4.5971, Loss CFM: 0.5148, Grad Norm: 3.4314, LR: 0.000020
Epoch 0, Iteration 2170, Loss: 5.1288, Loss AR: 4.6234, Loss CFM: 0.5054, Grad Norm: 3.6756, LR: 0.000020
Epoch 0, Iteration 2180, Loss: 5.2716, Loss AR: 4.7200, Loss CFM: 0.5516, Grad Norm: 4.7235, LR: 0.000020
Epoch 0, Iteration 2190, Loss: 5.1294, Loss AR: 4.5855, Loss CFM: 0.5439, Grad Norm: 4.9155, LR: 0.000020
Epoch 0, Iteration 2200, Loss: 5.3781, Loss AR: 4.8248, Loss CFM: 0.5533, Grad Norm: 4.5107, LR: 0.000020
Epoch 0, Iteration 2210, Loss: 5.0235, Loss AR: 4.4776, Loss CFM: 0.5459, Grad Norm: 4.8591, LR: 0.000020
Epoch 0, Iteration 2220, Loss: 5.1242, Loss AR: 4.6112, Loss CFM: 0.5130, Grad Norm: 4.4608, LR: 0.000020
Epoch 0, Iteration 2230, Loss: 5.3228, Loss AR: 4.8001, Loss CFM: 0.5227, Grad Norm: 4.7129, LR: 0.000020
Epoch 0, Iteration 2240, Loss: 5.2137, Loss AR: 4.6880, Loss CFM: 0.5257, Grad Norm: 4.1456, LR: 0.000020
Epoch 0, Iteration 2250, Loss: 5.3082, Loss AR: 4.6985, Loss CFM: 0.6097, Grad Norm: 4.5447, LR: 0.000020
Epoch 0, Iteration 2260, Loss: 5.1907, Loss AR: 4.6329, Loss CFM: 0.5578, Grad Norm: 5.6915, LR: 0.000020
Epoch 0, Iteration 2270, Loss: 5.4393, Loss AR: 4.8388, Loss CFM: 0.6005, Grad Norm: 5.5602, LR: 0.000020
Epoch 0, Iteration 2280, Loss: 5.1464, Loss AR: 4.5687, Loss CFM: 0.5777, Grad Norm: 3.3529, LR: 0.000020
Epoch 0, Iteration 2290, Loss: 5.4297, Loss AR: 4.8338, Loss CFM: 0.5959, Grad Norm: 3.5977, LR: 0.000020
Epoch 0, Iteration 2300, Loss: 5.3368, Loss AR: 4.7737, Loss CFM: 0.5632, Grad Norm: 4.5132, LR: 0.000020
Epoch 0, Iteration 2310, Loss: 5.3667, Loss AR: 4.7689, Loss CFM: 0.5979, Grad Norm: 6.1017, LR: 0.000020
Epoch 0, Iteration 2320, Loss: 5.1128, Loss AR: 4.5378, Loss CFM: 0.5750, Grad Norm: 4.9919, LR: 0.000020
Epoch 0, Iteration 2330, Loss: 5.5402, Loss AR: 5.0514, Loss CFM: 0.4887, Grad Norm: 4.4359, LR: 0.000020
Epoch 0, Iteration 2340, Loss: 5.3133, Loss AR: 4.7800, Loss CFM: 0.5333, Grad Norm: 5.2953, LR: 0.000020
Epoch 0, Iteration 2350, Loss: 5.2726, Loss AR: 4.7490, Loss CFM: 0.5236, Grad Norm: 4.7477, LR: 0.000020
Epoch 0, Iteration 2360, Loss: 5.2095, Loss AR: 4.6787, Loss CFM: 0.5307, Grad Norm: 3.7396, LR: 0.000020
Epoch 0, Iteration 2370, Loss: 5.2259, Loss AR: 4.6985, Loss CFM: 0.5273, Grad Norm: 4.2423, LR: 0.000020
Epoch 0, Iteration 2380, Loss: 5.4541, Loss AR: 4.8626, Loss CFM: 0.5915, Grad Norm: 3.9079, LR: 0.000020
Epoch 0, Iteration 2390, Loss: 5.0457, Loss AR: 4.5265, Loss CFM: 0.5191, Grad Norm: 2.9087, LR: 0.000020
Epoch 0, Iteration 2400, Loss: 5.2036, Loss AR: 4.7060, Loss CFM: 0.4976, Grad Norm: 4.6773, LR: 0.000020
Epoch 0, Iteration 2410, Loss: 5.3381, Loss AR: 4.8318, Loss CFM: 0.5063, Grad Norm: 4.0606, LR: 0.000020
Epoch 0, Iteration 2420, Loss: 5.5023, Loss AR: 4.9563, Loss CFM: 0.5460, Grad Norm: 4.7168, LR: 0.000020
Epoch 0, Iteration 2430, Loss: 5.2053, Loss AR: 4.5898, Loss CFM: 0.6154, Grad Norm: 4.4177, LR: 0.000020
Epoch 0, Iteration 2440, Loss: 5.1039, Loss AR: 4.5586, Loss CFM: 0.5453, Grad Norm: 4.0872, LR: 0.000020
Epoch 0, Iteration 2450, Loss: 5.1865, Loss AR: 4.6719, Loss CFM: 0.5146, Grad Norm: 3.7610, LR: 0.000020
Epoch 0, Iteration 2460, Loss: 5.3403, Loss AR: 4.7395, Loss CFM: 0.6008, Grad Norm: 4.0496, LR: 0.000020
Epoch 0, Iteration 2470, Loss: 5.1851, Loss AR: 4.5738, Loss CFM: 0.6113, Grad Norm: 6.0342, LR: 0.000020
Epoch 0, Iteration 2480, Loss: 5.2961, Loss AR: 4.6744, Loss CFM: 0.6217, Grad Norm: 5.3342, LR: 0.000020
Epoch 0, Iteration 2490, Loss: 5.1073, Loss AR: 4.4956, Loss CFM: 0.6116, Grad Norm: 4.1822, LR: 0.000020
Epoch 0, Iteration 2500, Loss: 5.1131, Loss AR: 4.5522, Loss CFM: 0.5609, Grad Norm: 5.6727, LR: 0.000020
Epoch 0, Iteration 2510, Loss: 5.0065, Loss AR: 4.5381, Loss CFM: 0.4684, Grad Norm: 4.4211, LR: 0.000020
Epoch 0, Iteration 2520, Loss: 5.2826, Loss AR: 4.6687, Loss CFM: 0.6139, Grad Norm: 7.6101, LR: 0.000020
Epoch 0, Iteration 2530, Loss: 5.2981, Loss AR: 4.7682, Loss CFM: 0.5299, Grad Norm: 3.6036, LR: 0.000020
Epoch 0, Iteration 2540, Loss: 4.9792, Loss AR: 4.4489, Loss CFM: 0.5303, Grad Norm: 4.0184, LR: 0.000020
Epoch 0, Iteration 2550, Loss: 5.3323, Loss AR: 4.7438, Loss CFM: 0.5885, Grad Norm: 5.1646, LR: 0.000020
Epoch 0, Iteration 2560, Loss: 5.0637, Loss AR: 4.5153, Loss CFM: 0.5485, Grad Norm: 3.9658, LR: 0.000020
Epoch 0, Iteration 2570, Loss: 5.3164, Loss AR: 4.6509, Loss CFM: 0.6655, Grad Norm: 5.4962, LR: 0.000020
Epoch 0, Iteration 2580, Loss: 5.1430, Loss AR: 4.6132, Loss CFM: 0.5298, Grad Norm: 4.0820, LR: 0.000020
Epoch 0, Iteration 2590, Loss: 5.2126, Loss AR: 4.6373, Loss CFM: 0.5753, Grad Norm: 3.5724, LR: 0.000020
Epoch 0, Iteration 2600, Loss: 5.1168, Loss AR: 4.6111, Loss CFM: 0.5058, Grad Norm: 5.1029, LR: 0.000020
Epoch 0, Iteration 2610, Loss: 5.0811, Loss AR: 4.5364, Loss CFM: 0.5447, Grad Norm: 3.8645, LR: 0.000020
Epoch 0, Iteration 2620, Loss: 5.1330, Loss AR: 4.5745, Loss CFM: 0.5586, Grad Norm: 9.2228, LR: 0.000020
Epoch 0, Iteration 2630, Loss: 5.1812, Loss AR: 4.6201, Loss CFM: 0.5612, Grad Norm: 6.0890, LR: 0.000020
Epoch 0, Iteration 2640, Loss: 5.2228, Loss AR: 4.7267, Loss CFM: 0.4961, Grad Norm: 4.1771, LR: 0.000020
Epoch 0, Iteration 2650, Loss: 5.2216, Loss AR: 4.6098, Loss CFM: 0.6118, Grad Norm: 3.7765, LR: 0.000020
Epoch 0, Iteration 2660, Loss: 5.1347, Loss AR: 4.6163, Loss CFM: 0.5184, Grad Norm: 4.8386, LR: 0.000020
Epoch 0, Iteration 2670, Loss: 5.1565, Loss AR: 4.6480, Loss CFM: 0.5084, Grad Norm: 3.3331, LR: 0.000020
Epoch 0, Iteration 2680, Loss: 5.1997, Loss AR: 4.6891, Loss CFM: 0.5106, Grad Norm: 4.1276, LR: 0.000020
Epoch 0, Iteration 2690, Loss: 5.2741, Loss AR: 4.7672, Loss CFM: 0.5069, Grad Norm: 3.6092, LR: 0.000020
Epoch 0, Iteration 2700, Loss: 5.1675, Loss AR: 4.5925, Loss CFM: 0.5750, Grad Norm: 3.7822, LR: 0.000020
Epoch 0, Iteration 2710, Loss: 5.6085, Loss AR: 5.1100, Loss CFM: 0.4985, Grad Norm: 5.6050, LR: 0.000020
Epoch 0, Iteration 2720, Loss: 5.0433, Loss AR: 4.4638, Loss CFM: 0.5795, Grad Norm: 4.6467, LR: 0.000020
Epoch 0, Iteration 2730, Loss: 5.3116, Loss AR: 4.7698, Loss CFM: 0.5417, Grad Norm: 5.5225, LR: 0.000020
Epoch 0, Iteration 2740, Loss: 5.1903, Loss AR: 4.6282, Loss CFM: 0.5621, Grad Norm: 6.2715, LR: 0.000020
Epoch 0, Iteration 2750, Loss: 5.1704, Loss AR: 4.6608, Loss CFM: 0.5096, Grad Norm: 4.3155, LR: 0.000020
Epoch 0, Iteration 2760, Loss: 5.6026, Loss AR: 4.9501, Loss CFM: 0.6525, Grad Norm: 4.3798, LR: 0.000020
Epoch 0, Iteration 2770, Loss: 5.2562, Loss AR: 4.6932, Loss CFM: 0.5630, Grad Norm: 3.8026, LR: 0.000020
Epoch 0, Iteration 2780, Loss: 4.8979, Loss AR: 4.3648, Loss CFM: 0.5331, Grad Norm: 4.1019, LR: 0.000020
Epoch 0, Iteration 2790, Loss: 5.3014, Loss AR: 4.6886, Loss CFM: 0.6128, Grad Norm: 4.7484, LR: 0.000020
Epoch 0, Iteration 2800, Loss: 5.2091, Loss AR: 4.6322, Loss CFM: 0.5769, Grad Norm: 5.2485, LR: 0.000020
Epoch 0, Iteration 2810, Loss: 5.2183, Loss AR: 4.6836, Loss CFM: 0.5347, Grad Norm: 4.4818, LR: 0.000020
Epoch 0, Iteration 2820, Loss: 5.2182, Loss AR: 4.6054, Loss CFM: 0.6128, Grad Norm: 3.4614, LR: 0.000020
Epoch 0, Iteration 2830, Loss: 5.0089, Loss AR: 4.4959, Loss CFM: 0.5129, Grad Norm: 4.1593, LR: 0.000020
Epoch 0, Iteration 2840, Loss: 5.2654, Loss AR: 4.6955, Loss CFM: 0.5699, Grad Norm: 4.4056, LR: 0.000020
Epoch 0, Iteration 2850, Loss: 5.4702, Loss AR: 4.8979, Loss CFM: 0.5723, Grad Norm: 4.8719, LR: 0.000020
Epoch 0, Iteration 2860, Loss: 5.4574, Loss AR: 4.8705, Loss CFM: 0.5868, Grad Norm: 5.7330, LR: 0.000020
Epoch 0, Iteration 2870, Loss: 5.2086, Loss AR: 4.5760, Loss CFM: 0.6326, Grad Norm: 5.7516, LR: 0.000020
Epoch 0, Iteration 2880, Loss: 5.1862, Loss AR: 4.6323, Loss CFM: 0.5539, Grad Norm: 4.3720, LR: 0.000020
Epoch 0, Iteration 2890, Loss: 5.3213, Loss AR: 4.7672, Loss CFM: 0.5541, Grad Norm: 5.4417, LR: 0.000020
Epoch 0, Iteration 2900, Loss: 5.3823, Loss AR: 4.8082, Loss CFM: 0.5740, Grad Norm: 5.8042, LR: 0.000020
Epoch 0, Iteration 2910, Loss: 5.2278, Loss AR: 4.6598, Loss CFM: 0.5679, Grad Norm: 4.9443, LR: 0.000020
Epoch 0, Iteration 2920, Loss: 5.3527, Loss AR: 4.8089, Loss CFM: 0.5438, Grad Norm: 3.8902, LR: 0.000020
Epoch 0, Iteration 2930, Loss: 5.1350, Loss AR: 4.6239, Loss CFM: 0.5110, Grad Norm: 4.0301, LR: 0.000020
Epoch 0, Iteration 2940, Loss: 5.0664, Loss AR: 4.5591, Loss CFM: 0.5073, Grad Norm: 4.3964, LR: 0.000020
Epoch 0, Iteration 2950, Loss: 5.3945, Loss AR: 4.8586, Loss CFM: 0.5359, Grad Norm: 4.1502, LR: 0.000020
Epoch 0, Iteration 2960, Loss: 5.1254, Loss AR: 4.5735, Loss CFM: 0.5519, Grad Norm: 6.3344, LR: 0.000020
Epoch 0, Iteration 2970, Loss: 5.2742, Loss AR: 4.7055, Loss CFM: 0.5687, Grad Norm: 4.9588, LR: 0.000020
Epoch 0, Iteration 2980, Loss: 4.9883, Loss AR: 4.4524, Loss CFM: 0.5359, Grad Norm: 4.0459, LR: 0.000020
Epoch 0, Iteration 2990, Loss: 5.4286, Loss AR: 4.9374, Loss CFM: 0.4912, Grad Norm: 5.5831, LR: 0.000020
Epoch 0, Iteration 3000, Loss: 5.3190, Loss AR: 4.7431, Loss CFM: 0.5759, Grad Norm: 3.3341, LR: 0.000020
Epoch 0, Iteration 3010, Loss: 4.9910, Loss AR: 4.4667, Loss CFM: 0.5243, Grad Norm: 3.9395, LR: 0.000020
Epoch 0, Iteration 3020, Loss: 4.9696, Loss AR: 4.4467, Loss CFM: 0.5229, Grad Norm: 3.8160, LR: 0.000020
Epoch 0, Iteration 3030, Loss: 5.3244, Loss AR: 4.7457, Loss CFM: 0.5788, Grad Norm: 4.9688, LR: 0.000020
Epoch 0, Iteration 3040, Loss: 5.1100, Loss AR: 4.5201, Loss CFM: 0.5899, Grad Norm: 4.3474, LR: 0.000020
Epoch 0, Iteration 3050, Loss: 5.1900, Loss AR: 4.6454, Loss CFM: 0.5447, Grad Norm: 5.0317, LR: 0.000020
Epoch 0, Iteration 3060, Loss: 5.3986, Loss AR: 4.8230, Loss CFM: 0.5755, Grad Norm: 5.9226, LR: 0.000020
Epoch 0, Iteration 3070, Loss: 5.1842, Loss AR: 4.5859, Loss CFM: 0.5984, Grad Norm: 4.5289, LR: 0.000020
Epoch 0, Iteration 3080, Loss: 5.3056, Loss AR: 4.7107, Loss CFM: 0.5949, Grad Norm: 4.9412, LR: 0.000020
Epoch 0, Iteration 3090, Loss: 5.0552, Loss AR: 4.5201, Loss CFM: 0.5351, Grad Norm: 4.6003, LR: 0.000020
Epoch 0, Iteration 3100, Loss: 5.3653, Loss AR: 4.7793, Loss CFM: 0.5860, Grad Norm: 3.8735, LR: 0.000020
Epoch 0, Iteration 3110, Loss: 5.2162, Loss AR: 4.6558, Loss CFM: 0.5605, Grad Norm: 5.4971, LR: 0.000020
Epoch 0, Iteration 3120, Loss: 5.1918, Loss AR: 4.6477, Loss CFM: 0.5441, Grad Norm: 4.2606, LR: 0.000020
Epoch 0, Iteration 3130, Loss: 5.4369, Loss AR: 4.8115, Loss CFM: 0.6254, Grad Norm: 3.7867, LR: 0.000020
Epoch 0, Iteration 3140, Loss: 5.4135, Loss AR: 4.8815, Loss CFM: 0.5321, Grad Norm: 3.1986, LR: 0.000020
Epoch 0, Iteration 3150, Loss: 4.9876, Loss AR: 4.4569, Loss CFM: 0.5308, Grad Norm: 4.2216, LR: 0.000020
Epoch 0, Iteration 3160, Loss: 5.0239, Loss AR: 4.4690, Loss CFM: 0.5549, Grad Norm: 6.0823, LR: 0.000020
Epoch 0, Iteration 3170, Loss: 5.0898, Loss AR: 4.5811, Loss CFM: 0.5086, Grad Norm: 2.7628, LR: 0.000020
Epoch 0, Iteration 3180, Loss: 5.1542, Loss AR: 4.5881, Loss CFM: 0.5662, Grad Norm: 4.4643, LR: 0.000020
Epoch 0, Iteration 3190, Loss: 5.2111, Loss AR: 4.6695, Loss CFM: 0.5416, Grad Norm: 5.8797, LR: 0.000020
Epoch 0, Iteration 3200, Loss: 6.0603, Loss AR: 5.4990, Loss CFM: 0.5613, Grad Norm: 5.0584, LR: 0.000020
Epoch 0, Iteration 3210, Loss: 5.2640, Loss AR: 4.6656, Loss CFM: 0.5983, Grad Norm: 4.6163, LR: 0.000020
Epoch 0, Iteration 3220, Loss: 5.1655, Loss AR: 4.6248, Loss CFM: 0.5407, Grad Norm: 3.3978, LR: 0.000020
Epoch 0, Iteration 3230, Loss: 5.1965, Loss AR: 4.6550, Loss CFM: 0.5416, Grad Norm: 5.0457, LR: 0.000020
Epoch 0, Iteration 3240, Loss: 5.2040, Loss AR: 4.6618, Loss CFM: 0.5422, Grad Norm: 4.3978, LR: 0.000020
Epoch 0, Iteration 3250, Loss: 5.2667, Loss AR: 4.6876, Loss CFM: 0.5792, Grad Norm: 3.1898, LR: 0.000020
Epoch 0, Iteration 3260, Loss: 5.4368, Loss AR: 4.8574, Loss CFM: 0.5795, Grad Norm: 5.0969, LR: 0.000020
Epoch 0, Iteration 3270, Loss: 5.2556, Loss AR: 4.6802, Loss CFM: 0.5754, Grad Norm: 3.8848, LR: 0.000020
Epoch 0, Iteration 3280, Loss: 5.2943, Loss AR: 4.7338, Loss CFM: 0.5605, Grad Norm: 3.5393, LR: 0.000020
Epoch 0, Iteration 3290, Loss: 5.3801, Loss AR: 4.8399, Loss CFM: 0.5402, Grad Norm: 4.0477, LR: 0.000020
Epoch 0, Iteration 3300, Loss: 5.0078, Loss AR: 4.4827, Loss CFM: 0.5251, Grad Norm: 3.8989, LR: 0.000020
Epoch 0, Iteration 3310, Loss: 5.1769, Loss AR: 4.6703, Loss CFM: 0.5066, Grad Norm: 4.7486, LR: 0.000020
Epoch 0, Iteration 3320, Loss: 5.1945, Loss AR: 4.6580, Loss CFM: 0.5365, Grad Norm: 4.8103, LR: 0.000020
Epoch 0, Iteration 3330, Loss: 5.5145, Loss AR: 4.9193, Loss CFM: 0.5953, Grad Norm: 4.8059, LR: 0.000020
Epoch 0, Iteration 3340, Loss: 5.0385, Loss AR: 4.4646, Loss CFM: 0.5739, Grad Norm: 3.4584, LR: 0.000020
Epoch 0, Iteration 3350, Loss: 5.0324, Loss AR: 4.5010, Loss CFM: 0.5314, Grad Norm: 3.3700, LR: 0.000020
Epoch 0, Iteration 3360, Loss: 5.1626, Loss AR: 4.6041, Loss CFM: 0.5585, Grad Norm: 3.7375, LR: 0.000020
Epoch 0, Iteration 3370, Loss: 5.3425, Loss AR: 4.7956, Loss CFM: 0.5470, Grad Norm: 4.1336, LR: 0.000020
Epoch 0, Iteration 3380, Loss: 5.3441, Loss AR: 4.8113, Loss CFM: 0.5328, Grad Norm: 4.9706, LR: 0.000020
Epoch 0, Iteration 3390, Loss: 5.1720, Loss AR: 4.6182, Loss CFM: 0.5538, Grad Norm: 4.4101, LR: 0.000020
Epoch 0, Iteration 3400, Loss: 4.9802, Loss AR: 4.4088, Loss CFM: 0.5714, Grad Norm: 3.9702, LR: 0.000020
Epoch 0, Iteration 3410, Loss: 5.2566, Loss AR: 4.6608, Loss CFM: 0.5958, Grad Norm: 6.1982, LR: 0.000020
Epoch 0, Iteration 3420, Loss: 5.2172, Loss AR: 4.6515, Loss CFM: 0.5657, Grad Norm: 3.3650, LR: 0.000020
Epoch 0, Iteration 3430, Loss: 5.3554, Loss AR: 4.7433, Loss CFM: 0.6121, Grad Norm: 3.8110, LR: 0.000020
Epoch 0, Iteration 3440, Loss: 5.5116, Loss AR: 4.9750, Loss CFM: 0.5367, Grad Norm: 5.1618, LR: 0.000020
Epoch 0, Iteration 3450, Loss: 5.1466, Loss AR: 4.6016, Loss CFM: 0.5450, Grad Norm: 3.4579, LR: 0.000020
Epoch 0, Iteration 3460, Loss: 5.2869, Loss AR: 4.7417, Loss CFM: 0.5453, Grad Norm: 5.3863, LR: 0.000020
Epoch 0, Iteration 3470, Loss: 4.8747, Loss AR: 4.3771, Loss CFM: 0.4977, Grad Norm: 3.8765, LR: 0.000020
Epoch 0, Iteration 3480, Loss: 4.9749, Loss AR: 4.3755, Loss CFM: 0.5994, Grad Norm: 4.4025, LR: 0.000020
Epoch 0, Iteration 3490, Loss: 5.1394, Loss AR: 4.5192, Loss CFM: 0.6202, Grad Norm: 3.9735, LR: 0.000020
Epoch 0, Iteration 3500, Loss: 5.2368, Loss AR: 4.7298, Loss CFM: 0.5070, Grad Norm: 9.6594, LR: 0.000020
Epoch 0, Iteration 3510, Loss: 5.2153, Loss AR: 4.6607, Loss CFM: 0.5546, Grad Norm: 3.2842, LR: 0.000020
Epoch 0, Iteration 3520, Loss: 5.1805, Loss AR: 4.6524, Loss CFM: 0.5280, Grad Norm: 5.9202, LR: 0.000020
Epoch 0, Iteration 3530, Loss: 4.9695, Loss AR: 4.4989, Loss CFM: 0.4706, Grad Norm: 3.8385, LR: 0.000020
Epoch 0, Iteration 3540, Loss: 5.0584, Loss AR: 4.5414, Loss CFM: 0.5170, Grad Norm: 2.6697, LR: 0.000020
Epoch 0, Iteration 3550, Loss: 5.3554, Loss AR: 4.8379, Loss CFM: 0.5175, Grad Norm: 4.2529, LR: 0.000020
Epoch 0, Iteration 3560, Loss: 5.1825, Loss AR: 4.5412, Loss CFM: 0.6413, Grad Norm: 4.5557, LR: 0.000020
Epoch 0, Iteration 3570, Loss: 5.4152, Loss AR: 4.8766, Loss CFM: 0.5386, Grad Norm: 3.4779, LR: 0.000020
Epoch 0, Iteration 3580, Loss: 5.0149, Loss AR: 4.4564, Loss CFM: 0.5585, Grad Norm: 4.6781, LR: 0.000020
Epoch 0, Iteration 3590, Loss: 5.2063, Loss AR: 4.6238, Loss CFM: 0.5824, Grad Norm: 3.9851, LR: 0.000020
Epoch 0, Iteration 3600, Loss: 5.1051, Loss AR: 4.5820, Loss CFM: 0.5231, Grad Norm: 4.5686, LR: 0.000020
Epoch 0, Iteration 3610, Loss: 5.0414, Loss AR: 4.4732, Loss CFM: 0.5682, Grad Norm: 3.2006, LR: 0.000020
Epoch 0, Iteration 3620, Loss: 5.3928, Loss AR: 4.8715, Loss CFM: 0.5213, Grad Norm: 4.0729, LR: 0.000020
Epoch 0, Iteration 3630, Loss: 5.1907, Loss AR: 4.6387, Loss CFM: 0.5520, Grad Norm: 6.2411, LR: 0.000020
Epoch 0, Iteration 3640, Loss: 5.0021, Loss AR: 4.4215, Loss CFM: 0.5805, Grad Norm: 4.0409, LR: 0.000020
Epoch 0, Iteration 3650, Loss: 5.2853, Loss AR: 4.6934, Loss CFM: 0.5919, Grad Norm: 4.1106, LR: 0.000020
Epoch 0, Iteration 3660, Loss: 4.9946, Loss AR: 4.4537, Loss CFM: 0.5409, Grad Norm: 3.3245, LR: 0.000020
Epoch 0, Iteration 3670, Loss: 5.0366, Loss AR: 4.5071, Loss CFM: 0.5295, Grad Norm: 3.5969, LR: 0.000020
Epoch 0, Iteration 3680, Loss: 5.1130, Loss AR: 4.6150, Loss CFM: 0.4980, Grad Norm: 3.5030, LR: 0.000020
Epoch 0, Iteration 3690, Loss: 5.1972, Loss AR: 4.6331, Loss CFM: 0.5641, Grad Norm: 4.2700, LR: 0.000020
Epoch 0, Iteration 3700, Loss: 5.0522, Loss AR: 4.4009, Loss CFM: 0.6512, Grad Norm: 3.8618, LR: 0.000020
Epoch 0, Iteration 3710, Loss: 4.9017, Loss AR: 4.4141, Loss CFM: 0.4877, Grad Norm: 3.7948, LR: 0.000020
Epoch 0, Iteration 3720, Loss: 5.4565, Loss AR: 4.8394, Loss CFM: 0.6171, Grad Norm: 4.5469, LR: 0.000020
Epoch 0, Iteration 3730, Loss: 4.9446, Loss AR: 4.3816, Loss CFM: 0.5630, Grad Norm: 4.2468, LR: 0.000020
Epoch 0, Iteration 3740, Loss: 5.1005, Loss AR: 4.5518, Loss CFM: 0.5487, Grad Norm: 3.6193, LR: 0.000020
Epoch 0, Iteration 3750, Loss: 5.1878, Loss AR: 4.6628, Loss CFM: 0.5250, Grad Norm: 3.4427, LR: 0.000020
Epoch 0, Iteration 3760, Loss: 5.2894, Loss AR: 4.6763, Loss CFM: 0.6130, Grad Norm: 3.2890, LR: 0.000020
Epoch 0, Iteration 3770, Loss: 4.9524, Loss AR: 4.4568, Loss CFM: 0.4955, Grad Norm: 4.1026, LR: 0.000020
Epoch 0, Iteration 3780, Loss: 4.9397, Loss AR: 4.4136, Loss CFM: 0.5261, Grad Norm: 4.3419, LR: 0.000020
Epoch 0, Iteration 3790, Loss: 5.4712, Loss AR: 4.9108, Loss CFM: 0.5604, Grad Norm: 5.1905, LR: 0.000020
Epoch 0, Iteration 3800, Loss: 5.3296, Loss AR: 4.7767, Loss CFM: 0.5529, Grad Norm: 7.0093, LR: 0.000020
Epoch 0, Iteration 3810, Loss: 5.4127, Loss AR: 4.8146, Loss CFM: 0.5981, Grad Norm: 3.1689, LR: 0.000020
Epoch 0, Iteration 3820, Loss: 5.1680, Loss AR: 4.6152, Loss CFM: 0.5528, Grad Norm: 4.2450, LR: 0.000020
Epoch 0, Iteration 3830, Loss: 5.0678, Loss AR: 4.4802, Loss CFM: 0.5876, Grad Norm: 4.8423, LR: 0.000020
Epoch 0, Iteration 3840, Loss: 5.3027, Loss AR: 4.6905, Loss CFM: 0.6122, Grad Norm: 3.8560, LR: 0.000020
Epoch 0, Iteration 3850, Loss: 5.4855, Loss AR: 4.9272, Loss CFM: 0.5583, Grad Norm: 3.8645, LR: 0.000020
Epoch 0, Iteration 3860, Loss: 4.9900, Loss AR: 4.5014, Loss CFM: 0.4886, Grad Norm: 3.4681, LR: 0.000020
Epoch 0, Iteration 3870, Loss: 5.2744, Loss AR: 4.7648, Loss CFM: 0.5096, Grad Norm: 4.1345, LR: 0.000020
Epoch 0, Iteration 3880, Loss: 5.3815, Loss AR: 4.8070, Loss CFM: 0.5745, Grad Norm: 5.1595, LR: 0.000020
Epoch 0, Iteration 3890, Loss: 5.1659, Loss AR: 4.6021, Loss CFM: 0.5638, Grad Norm: 4.3199, LR: 0.000020
Epoch 0, Iteration 3900, Loss: 5.3394, Loss AR: 4.7871, Loss CFM: 0.5524, Grad Norm: 4.0214, LR: 0.000020
Epoch 0, Iteration 3910, Loss: 5.1287, Loss AR: 4.6056, Loss CFM: 0.5231, Grad Norm: 3.6093, LR: 0.000020
Epoch 0, Iteration 3920, Loss: 5.0801, Loss AR: 4.5232, Loss CFM: 0.5569, Grad Norm: 3.0531, LR: 0.000020
Epoch 0, Iteration 3930, Loss: 5.2588, Loss AR: 4.6864, Loss CFM: 0.5724, Grad Norm: 4.5224, LR: 0.000020
Epoch 0, Iteration 3940, Loss: 5.2636, Loss AR: 4.6694, Loss CFM: 0.5943, Grad Norm: 4.8293, LR: 0.000020
Epoch 0, Iteration 3950, Loss: 5.3233, Loss AR: 4.7442, Loss CFM: 0.5792, Grad Norm: 3.6305, LR: 0.000020
Epoch 0, Iteration 3960, Loss: 4.9952, Loss AR: 4.4491, Loss CFM: 0.5461, Grad Norm: 5.2930, LR: 0.000020
Epoch 0, Iteration 3970, Loss: 5.2400, Loss AR: 4.6169, Loss CFM: 0.6231, Grad Norm: 4.0380, LR: 0.000020
Epoch 0, Iteration 3980, Loss: 5.2095, Loss AR: 4.6550, Loss CFM: 0.5545, Grad Norm: 4.6260, LR: 0.000020
Epoch 0, Iteration 3990, Loss: 5.0922, Loss AR: 4.5512, Loss CFM: 0.5411, Grad Norm: 4.9227, LR: 0.000020
Epoch 0, Iteration 4000, Loss: 5.3100, Loss AR: 4.7311, Loss CFM: 0.5790, Grad Norm: 4.3938, LR: 0.000020
Epoch 0, Iteration 4010, Loss: 5.0111, Loss AR: 4.4400, Loss CFM: 0.5711, Grad Norm: 3.5561, LR: 0.000020
Epoch 0, Iteration 4020, Loss: 5.2034, Loss AR: 4.6121, Loss CFM: 0.5913, Grad Norm: 3.8797, LR: 0.000020
Epoch 0, Iteration 4030, Loss: 5.2012, Loss AR: 4.6273, Loss CFM: 0.5739, Grad Norm: 4.9909, LR: 0.000020
Epoch 0, Iteration 4040, Loss: 5.3233, Loss AR: 4.7229, Loss CFM: 0.6004, Grad Norm: 5.8704, LR: 0.000020
Epoch 0, Iteration 4050, Loss: 5.2364, Loss AR: 4.7726, Loss CFM: 0.4637, Grad Norm: 4.0090, LR: 0.000020
Epoch 0, Iteration 4060, Loss: 5.0472, Loss AR: 4.4791, Loss CFM: 0.5681, Grad Norm: 4.2106, LR: 0.000020
Epoch 0, Iteration 4070, Loss: 5.4700, Loss AR: 4.8943, Loss CFM: 0.5757, Grad Norm: 5.1211, LR: 0.000020
Epoch 0, Iteration 4080, Loss: 5.1098, Loss AR: 4.6263, Loss CFM: 0.4835, Grad Norm: 4.2066, LR: 0.000020
Epoch 0, Iteration 4090, Loss: 5.1301, Loss AR: 4.6067, Loss CFM: 0.5235, Grad Norm: 5.4670, LR: 0.000020
Epoch 0, Iteration 4100, Loss: 5.2128, Loss AR: 4.5966, Loss CFM: 0.6162, Grad Norm: 3.2383, LR: 0.000020
Epoch 0, Iteration 4110, Loss: 4.9089, Loss AR: 4.3353, Loss CFM: 0.5736, Grad Norm: 4.1087, LR: 0.000020
Epoch 0, Iteration 4120, Loss: 5.4557, Loss AR: 4.9148, Loss CFM: 0.5409, Grad Norm: 4.2101, LR: 0.000020
Epoch 0, Iteration 4130, Loss: 5.1213, Loss AR: 4.6155, Loss CFM: 0.5058, Grad Norm: 3.4699, LR: 0.000020
Epoch 0, Iteration 4140, Loss: 5.2559, Loss AR: 4.6821, Loss CFM: 0.5738, Grad Norm: 5.4637, LR: 0.000020
Epoch 0, Iteration 4150, Loss: 5.0868, Loss AR: 4.4945, Loss CFM: 0.5923, Grad Norm: 4.5660, LR: 0.000020
Epoch 0, Iteration 4160, Loss: 5.2445, Loss AR: 4.6890, Loss CFM: 0.5555, Grad Norm: 3.3122, LR: 0.000020
Epoch 0, Iteration 4170, Loss: 4.8805, Loss AR: 4.3249, Loss CFM: 0.5556, Grad Norm: 4.4338, LR: 0.000020
Epoch 0, Iteration 4180, Loss: 5.4939, Loss AR: 4.9182, Loss CFM: 0.5757, Grad Norm: 4.5077, LR: 0.000020
Epoch 0, Iteration 4190, Loss: 5.3550, Loss AR: 4.8244, Loss CFM: 0.5306, Grad Norm: 4.0603, LR: 0.000020
Epoch 0, Iteration 4200, Loss: 5.1150, Loss AR: 4.6131, Loss CFM: 0.5019, Grad Norm: 3.6687, LR: 0.000020
Epoch 0, Iteration 4210, Loss: 5.2354, Loss AR: 4.7135, Loss CFM: 0.5219, Grad Norm: 3.5424, LR: 0.000020
Epoch 0, Iteration 4220, Loss: 5.1757, Loss AR: 4.5709, Loss CFM: 0.6048, Grad Norm: 3.4266, LR: 0.000020
Epoch 0, Iteration 4230, Loss: 5.0531, Loss AR: 4.4225, Loss CFM: 0.6306, Grad Norm: 4.8382, LR: 0.000020
Epoch 0, Iteration 4240, Loss: 5.7841, Loss AR: 5.1638, Loss CFM: 0.6203, Grad Norm: 8.1015, LR: 0.000020
Epoch 0, Iteration 4250, Loss: 5.0322, Loss AR: 4.4734, Loss CFM: 0.5589, Grad Norm: 4.3146, LR: 0.000020
Epoch 0, Iteration 4260, Loss: 5.0390, Loss AR: 4.4551, Loss CFM: 0.5839, Grad Norm: 3.5779, LR: 0.000020
Epoch 0, Iteration 4270, Loss: 5.0206, Loss AR: 4.5157, Loss CFM: 0.5049, Grad Norm: 3.2843, LR: 0.000020
Epoch 0, Iteration 4280, Loss: 5.0887, Loss AR: 4.6138, Loss CFM: 0.4749, Grad Norm: 10.4175, LR: 0.000020
Epoch 0, Iteration 4290, Loss: 5.5458, Loss AR: 5.0021, Loss CFM: 0.5437, Grad Norm: 3.9863, LR: 0.000020
Epoch 0, Iteration 4300, Loss: 5.0825, Loss AR: 4.5268, Loss CFM: 0.5557, Grad Norm: 3.9744, LR: 0.000020
Epoch 0, Iteration 4310, Loss: 5.1923, Loss AR: 4.6035, Loss CFM: 0.5888, Grad Norm: 6.0375, LR: 0.000020
Epoch 0, Iteration 4320, Loss: 5.1429, Loss AR: 4.6126, Loss CFM: 0.5303, Grad Norm: 3.7605, LR: 0.000020
Epoch 0, Iteration 4330, Loss: 5.1378, Loss AR: 4.5337, Loss CFM: 0.6041, Grad Norm: 4.4965, LR: 0.000020
Epoch 0, Iteration 4340, Loss: 5.2015, Loss AR: 4.6866, Loss CFM: 0.5149, Grad Norm: 3.5540, LR: 0.000020
Epoch 0, Iteration 4350, Loss: 5.2099, Loss AR: 4.6680, Loss CFM: 0.5420, Grad Norm: 4.5768, LR: 0.000020
Epoch 0, Iteration 4360, Loss: 5.2289, Loss AR: 4.7593, Loss CFM: 0.4696, Grad Norm: 4.7600, LR: 0.000020
Epoch 0, Iteration 4370, Loss: 5.1915, Loss AR: 4.6785, Loss CFM: 0.5129, Grad Norm: 5.5728, LR: 0.000020
Epoch 0, Iteration 4380, Loss: 5.0542, Loss AR: 4.4622, Loss CFM: 0.5919, Grad Norm: 4.6095, LR: 0.000020
Epoch 0, Iteration 4390, Loss: 5.1515, Loss AR: 4.6510, Loss CFM: 0.5005, Grad Norm: 5.4092, LR: 0.000020
Epoch 0, Iteration 4400, Loss: 5.3196, Loss AR: 4.7475, Loss CFM: 0.5721, Grad Norm: 4.2366, LR: 0.000020
Epoch 0, Iteration 4410, Loss: 5.5932, Loss AR: 5.0123, Loss CFM: 0.5808, Grad Norm: 5.4554, LR: 0.000020
Epoch 0, Iteration 4420, Loss: 5.3365, Loss AR: 4.7284, Loss CFM: 0.6081, Grad Norm: 5.1646, LR: 0.000020
Epoch 0, Iteration 4430, Loss: 5.0109, Loss AR: 4.4674, Loss CFM: 0.5435, Grad Norm: 3.6984, LR: 0.000020
Epoch 0, Iteration 4440, Loss: 5.1466, Loss AR: 4.5869, Loss CFM: 0.5597, Grad Norm: 4.2085, LR: 0.000020
Epoch 0, Iteration 4450, Loss: 5.1047, Loss AR: 4.5355, Loss CFM: 0.5693, Grad Norm: 4.3220, LR: 0.000020
Epoch 0, Iteration 4460, Loss: 5.1955, Loss AR: 4.6294, Loss CFM: 0.5661, Grad Norm: 4.6711, LR: 0.000020
Epoch 0, Iteration 4470, Loss: 5.2068, Loss AR: 4.6105, Loss CFM: 0.5963, Grad Norm: 4.0307, LR: 0.000020
Epoch 0, Iteration 4480, Loss: 5.3105, Loss AR: 4.7585, Loss CFM: 0.5520, Grad Norm: 3.6493, LR: 0.000020
Epoch 0, Iteration 4490, Loss: 5.1798, Loss AR: 4.5912, Loss CFM: 0.5886, Grad Norm: 3.7571, LR: 0.000020
Epoch 0, Iteration 4500, Loss: 5.1747, Loss AR: 4.6209, Loss CFM: 0.5538, Grad Norm: 4.8189, LR: 0.000020
Epoch 0, Iteration 4510, Loss: 5.0490, Loss AR: 4.4811, Loss CFM: 0.5679, Grad Norm: 4.2379, LR: 0.000020
Epoch 0, Iteration 4520, Loss: 5.2357, Loss AR: 4.6795, Loss CFM: 0.5562, Grad Norm: 3.5573, LR: 0.000020
Epoch 0, Iteration 4530, Loss: 5.1454, Loss AR: 4.6258, Loss CFM: 0.5196, Grad Norm: 4.3397, LR: 0.000020
Epoch 0, Iteration 4540, Loss: 5.1395, Loss AR: 4.6644, Loss CFM: 0.4751, Grad Norm: 4.3965, LR: 0.000020
Epoch 0, Iteration 4550, Loss: 5.0004, Loss AR: 4.4699, Loss CFM: 0.5305, Grad Norm: 3.7255, LR: 0.000020
Epoch 0, Iteration 4560, Loss: 5.0704, Loss AR: 4.4838, Loss CFM: 0.5867, Grad Norm: 3.9032, LR: 0.000020
Epoch 0, Iteration 4570, Loss: 5.1532, Loss AR: 4.6179, Loss CFM: 0.5353, Grad Norm: 5.0712, LR: 0.000020
Epoch 0, Iteration 4580, Loss: 5.0354, Loss AR: 4.4612, Loss CFM: 0.5742, Grad Norm: 4.1780, LR: 0.000020
Epoch 0, Iteration 4590, Loss: 5.0910, Loss AR: 4.5471, Loss CFM: 0.5439, Grad Norm: 4.4358, LR: 0.000020
Epoch 0, Iteration 4600, Loss: 5.6672, Loss AR: 5.1397, Loss CFM: 0.5275, Grad Norm: 6.2034, LR: 0.000020
Epoch 0, Iteration 4610, Loss: 5.1335, Loss AR: 4.5122, Loss CFM: 0.6213, Grad Norm: 5.7142, LR: 0.000020
Epoch 0, Iteration 4620, Loss: 5.1945, Loss AR: 4.6653, Loss CFM: 0.5291, Grad Norm: 5.1503, LR: 0.000020
Epoch 0, Iteration 4630, Loss: 5.1225, Loss AR: 4.5539, Loss CFM: 0.5686, Grad Norm: 4.9242, LR: 0.000020
Epoch 0, Iteration 4640, Loss: 5.0881, Loss AR: 4.5064, Loss CFM: 0.5817, Grad Norm: 3.6347, LR: 0.000020
Epoch 0, Iteration 4650, Loss: 5.2223, Loss AR: 4.7375, Loss CFM: 0.4848, Grad Norm: 5.4815, LR: 0.000020
Epoch 0, Iteration 4660, Loss: 5.2759, Loss AR: 4.7641, Loss CFM: 0.5118, Grad Norm: 5.5662, LR: 0.000020
Epoch 0, Iteration 4670, Loss: 5.1228, Loss AR: 4.5377, Loss CFM: 0.5851, Grad Norm: 5.0795, LR: 0.000020
Epoch 0, Iteration 4680, Loss: 5.2081, Loss AR: 4.6928, Loss CFM: 0.5153, Grad Norm: 4.7298, LR: 0.000020
Epoch 0, Iteration 4690, Loss: 5.1715, Loss AR: 4.5993, Loss CFM: 0.5722, Grad Norm: 5.8663, LR: 0.000020
Epoch 0, Iteration 4700, Loss: 5.0551, Loss AR: 4.5535, Loss CFM: 0.5016, Grad Norm: 4.5353, LR: 0.000020
Epoch 0, Iteration 4710, Loss: 5.2414, Loss AR: 4.6319, Loss CFM: 0.6095, Grad Norm: 5.7492, LR: 0.000020
Epoch 0, Iteration 4720, Loss: 4.9003, Loss AR: 4.3034, Loss CFM: 0.5969, Grad Norm: 5.0535, LR: 0.000020
Epoch 0, Iteration 4730, Loss: 5.1061, Loss AR: 4.5334, Loss CFM: 0.5727, Grad Norm: 4.0350, LR: 0.000020
Epoch 0, Iteration 4740, Loss: 4.9646, Loss AR: 4.4760, Loss CFM: 0.4886, Grad Norm: 2.9805, LR: 0.000020
Epoch 0, Iteration 4750, Loss: 5.2108, Loss AR: 4.5169, Loss CFM: 0.6939, Grad Norm: 4.5447, LR: 0.000020
Epoch 0, Iteration 4760, Loss: 5.1406, Loss AR: 4.5814, Loss CFM: 0.5592, Grad Norm: 5.1753, LR: 0.000020
Epoch 0, Iteration 4770, Loss: 5.0580, Loss AR: 4.5254, Loss CFM: 0.5327, Grad Norm: 4.3070, LR: 0.000020
Epoch 0, Iteration 4780, Loss: 5.1434, Loss AR: 4.5862, Loss CFM: 0.5572, Grad Norm: 3.4049, LR: 0.000020
Epoch 0, Iteration 4790, Loss: 4.9249, Loss AR: 4.3512, Loss CFM: 0.5737, Grad Norm: 3.6433, LR: 0.000020
Epoch 0, Iteration 4800, Loss: 5.3734, Loss AR: 4.8043, Loss CFM: 0.5690, Grad Norm: 7.3673, LR: 0.000020
Epoch 0, Iteration 4810, Loss: 5.1595, Loss AR: 4.6185, Loss CFM: 0.5410, Grad Norm: 4.1685, LR: 0.000020
Epoch 0, Iteration 4820, Loss: 5.1330, Loss AR: 4.6040, Loss CFM: 0.5291, Grad Norm: 4.1327, LR: 0.000020
Epoch 0, Iteration 4830, Loss: 5.2200, Loss AR: 4.6530, Loss CFM: 0.5670, Grad Norm: 3.9771, LR: 0.000020
Epoch 0, Iteration 4840, Loss: 4.9331, Loss AR: 4.3868, Loss CFM: 0.5463, Grad Norm: 4.0914, LR: 0.000020
Epoch 0, Iteration 4850, Loss: 5.2135, Loss AR: 4.7105, Loss CFM: 0.5030, Grad Norm: 4.1283, LR: 0.000020
Epoch 0, Iteration 4860, Loss: 5.1175, Loss AR: 4.5679, Loss CFM: 0.5496, Grad Norm: 3.8009, LR: 0.000020
Epoch 0, Iteration 4870, Loss: 5.0650, Loss AR: 4.4942, Loss CFM: 0.5708, Grad Norm: 5.0449, LR: 0.000020
Epoch 0, Iteration 4880, Loss: 4.9257, Loss AR: 4.4514, Loss CFM: 0.4743, Grad Norm: 3.6844, LR: 0.000020
Epoch 0, Iteration 4890, Loss: 5.1983, Loss AR: 4.6170, Loss CFM: 0.5814, Grad Norm: 4.0004, LR: 0.000020
Epoch 0, Iteration 4900, Loss: 5.3436, Loss AR: 4.7308, Loss CFM: 0.6128, Grad Norm: 3.6840, LR: 0.000020
Epoch 0, Iteration 4910, Loss: 4.9247, Loss AR: 4.4697, Loss CFM: 0.4550, Grad Norm: 3.6816, LR: 0.000020
Epoch 0, Iteration 4920, Loss: 5.2755, Loss AR: 4.7229, Loss CFM: 0.5526, Grad Norm: 5.3202, LR: 0.000020
Epoch 0, Iteration 4930, Loss: 5.1092, Loss AR: 4.5611, Loss CFM: 0.5481, Grad Norm: 5.7048, LR: 0.000020
Epoch 0, Iteration 4940, Loss: 5.2896, Loss AR: 4.6946, Loss CFM: 0.5950, Grad Norm: 4.8571, LR: 0.000020
Epoch 0, Iteration 4950, Loss: 5.1892, Loss AR: 4.5722, Loss CFM: 0.6170, Grad Norm: 3.2345, LR: 0.000020
Epoch 0, Iteration 4960, Loss: 5.0966, Loss AR: 4.5187, Loss CFM: 0.5779, Grad Norm: 3.3062, LR: 0.000020
Epoch 0, Iteration 4970, Loss: 5.1659, Loss AR: 4.6225, Loss CFM: 0.5434, Grad Norm: 3.2832, LR: 0.000020
Epoch 0, Iteration 4980, Loss: 5.2420, Loss AR: 4.6607, Loss CFM: 0.5813, Grad Norm: 4.3487, LR: 0.000020
Epoch 0, Iteration 4990, Loss: 5.0565, Loss AR: 4.5064, Loss CFM: 0.5501, Grad Norm: 5.4546, LR: 0.000020
Epoch 0, Iteration 5000, Loss: 5.0519, Loss AR: 4.6252, Loss CFM: 0.4267, Grad Norm: 4.6973, LR: 0.000020
Epoch 0, Iteration 5010, Loss: 5.5478, Loss AR: 4.9422, Loss CFM: 0.6056, Grad Norm: 4.3179, LR: 0.000020
Epoch 0, Iteration 5020, Loss: 5.2744, Loss AR: 4.7206, Loss CFM: 0.5538, Grad Norm: 5.0857, LR: 0.000020
Epoch 0, Iteration 5030, Loss: 5.0181, Loss AR: 4.4570, Loss CFM: 0.5611, Grad Norm: 3.6957, LR: 0.000020
Epoch 0, Iteration 5040, Loss: 5.2829, Loss AR: 4.7669, Loss CFM: 0.5160, Grad Norm: 4.4115, LR: 0.000020
Epoch 0, Iteration 5050, Loss: 5.1816, Loss AR: 4.7171, Loss CFM: 0.4644, Grad Norm: 4.1101, LR: 0.000020
Epoch 0, Iteration 5060, Loss: 5.3233, Loss AR: 4.8280, Loss CFM: 0.4953, Grad Norm: 4.2455, LR: 0.000020
Epoch 0, Iteration 5070, Loss: 5.0604, Loss AR: 4.4941, Loss CFM: 0.5664, Grad Norm: 3.1916, LR: 0.000020
Epoch 0, Iteration 5080, Loss: 5.0276, Loss AR: 4.5044, Loss CFM: 0.5231, Grad Norm: 3.9434, LR: 0.000020
Epoch 0, Iteration 5090, Loss: 5.4073, Loss AR: 4.8480, Loss CFM: 0.5593, Grad Norm: 3.3751, LR: 0.000020
Epoch 0, Iteration 5100, Loss: 4.9583, Loss AR: 4.3869, Loss CFM: 0.5714, Grad Norm: 4.8997, LR: 0.000020
Epoch 0, Iteration 5110, Loss: 5.4081, Loss AR: 4.8449, Loss CFM: 0.5631, Grad Norm: 4.5885, LR: 0.000020
Epoch 0, Iteration 5120, Loss: 5.0381, Loss AR: 4.5711, Loss CFM: 0.4671, Grad Norm: 3.0213, LR: 0.000020
Epoch 0, Iteration 5130, Loss: 4.9776, Loss AR: 4.4204, Loss CFM: 0.5571, Grad Norm: 3.7117, LR: 0.000020
Epoch 0, Iteration 5140, Loss: 5.0046, Loss AR: 4.4977, Loss CFM: 0.5068, Grad Norm: 3.8412, LR: 0.000020
Epoch 0, Iteration 5150, Loss: 5.3302, Loss AR: 4.7968, Loss CFM: 0.5334, Grad Norm: 4.7893, LR: 0.000020
Epoch 0, Iteration 5160, Loss: 5.0951, Loss AR: 4.5315, Loss CFM: 0.5636, Grad Norm: 4.2989, LR: 0.000020
Epoch 0, Iteration 5170, Loss: 5.0086, Loss AR: 4.4223, Loss CFM: 0.5863, Grad Norm: 2.9125, LR: 0.000020
Epoch 0, Iteration 5180, Loss: 5.0705, Loss AR: 4.5561, Loss CFM: 0.5143, Grad Norm: 3.9698, LR: 0.000020
Epoch 0, Iteration 5190, Loss: 5.0335, Loss AR: 4.4422, Loss CFM: 0.5913, Grad Norm: 3.7524, LR: 0.000020
Epoch 0, Iteration 5200, Loss: 4.9898, Loss AR: 4.4155, Loss CFM: 0.5743, Grad Norm: 3.9590, LR: 0.000020
Epoch 0, Iteration 5210, Loss: 4.9626, Loss AR: 4.4435, Loss CFM: 0.5191, Grad Norm: 4.5749, LR: 0.000020
Epoch 0, Iteration 5220, Loss: 5.0550, Loss AR: 4.6247, Loss CFM: 0.4303, Grad Norm: 3.8406, LR: 0.000020
Epoch 0, Iteration 5230, Loss: 5.2456, Loss AR: 4.7486, Loss CFM: 0.4970, Grad Norm: 4.1544, LR: 0.000020
Epoch 0, Iteration 5240, Loss: 5.0785, Loss AR: 4.5074, Loss CFM: 0.5712, Grad Norm: 4.2960, LR: 0.000020
Epoch 0, Iteration 5250, Loss: 4.7871, Loss AR: 4.2567, Loss CFM: 0.5304, Grad Norm: 4.4241, LR: 0.000020
Epoch 0, Iteration 5260, Loss: 4.8035, Loss AR: 4.2982, Loss CFM: 0.5053, Grad Norm: 4.5559, LR: 0.000020
Epoch 0, Iteration 5270, Loss: 5.2044, Loss AR: 4.6737, Loss CFM: 0.5308, Grad Norm: 4.2072, LR: 0.000020
Epoch 0, Iteration 5280, Loss: 4.9545, Loss AR: 4.4040, Loss CFM: 0.5505, Grad Norm: 5.0241, LR: 0.000020
Epoch 0, Iteration 5290, Loss: 5.3846, Loss AR: 4.7499, Loss CFM: 0.6348, Grad Norm: 4.2712, LR: 0.000020
Epoch 0, Iteration 5300, Loss: 5.2114, Loss AR: 4.7108, Loss CFM: 0.5006, Grad Norm: 3.4437, LR: 0.000020
Epoch 0, Iteration 5310, Loss: 5.3842, Loss AR: 4.7156, Loss CFM: 0.6686, Grad Norm: 6.6381, LR: 0.000020
Epoch 0, Iteration 5320, Loss: 5.2045, Loss AR: 4.5912, Loss CFM: 0.6133, Grad Norm: 3.8691, LR: 0.000020
Epoch 0, Iteration 5330, Loss: 5.2034, Loss AR: 4.5946, Loss CFM: 0.6088, Grad Norm: 3.7678, LR: 0.000020
Epoch 0, Iteration 5340, Loss: 5.2786, Loss AR: 4.7082, Loss CFM: 0.5703, Grad Norm: 4.3354, LR: 0.000020
Epoch 0, Iteration 5350, Loss: 5.1370, Loss AR: 4.6507, Loss CFM: 0.4863, Grad Norm: 6.3461, LR: 0.000020
Epoch 0, Iteration 5360, Loss: 5.3167, Loss AR: 4.8270, Loss CFM: 0.4897, Grad Norm: 4.1551, LR: 0.000020
Epoch 0, Iteration 5370, Loss: 5.1687, Loss AR: 4.6314, Loss CFM: 0.5373, Grad Norm: 3.7250, LR: 0.000020
Epoch 0, Iteration 5380, Loss: 5.1008, Loss AR: 4.5202, Loss CFM: 0.5806, Grad Norm: 4.4286, LR: 0.000020
Epoch 0, Iteration 5390, Loss: 5.2386, Loss AR: 4.6849, Loss CFM: 0.5537, Grad Norm: 3.4784, LR: 0.000020
Epoch 0, Iteration 5400, Loss: 5.1054, Loss AR: 4.5663, Loss CFM: 0.5392, Grad Norm: 3.6991, LR: 0.000020
Epoch 0, Iteration 5410, Loss: 5.1998, Loss AR: 4.6807, Loss CFM: 0.5191, Grad Norm: 5.8943, LR: 0.000020
Epoch 0, Iteration 5420, Loss: 5.3691, Loss AR: 4.8670, Loss CFM: 0.5021, Grad Norm: 3.4669, LR: 0.000020
Epoch 0, Iteration 5430, Loss: 5.1193, Loss AR: 4.6105, Loss CFM: 0.5088, Grad Norm: 5.4009, LR: 0.000020
Epoch 0, Iteration 5440, Loss: 5.2984, Loss AR: 4.7609, Loss CFM: 0.5376, Grad Norm: 3.4040, LR: 0.000020
Epoch 0, Iteration 5450, Loss: 5.1131, Loss AR: 4.5990, Loss CFM: 0.5141, Grad Norm: 3.4539, LR: 0.000020
Epoch 0, Iteration 5460, Loss: 5.1614, Loss AR: 4.6214, Loss CFM: 0.5400, Grad Norm: 4.2318, LR: 0.000020
Epoch 0, Iteration 5470, Loss: 5.4235, Loss AR: 4.8615, Loss CFM: 0.5620, Grad Norm: 4.4349, LR: 0.000020
Epoch 0, Iteration 5480, Loss: 5.0686, Loss AR: 4.5323, Loss CFM: 0.5363, Grad Norm: 4.9020, LR: 0.000020
Epoch 0, Iteration 5490, Loss: 5.0192, Loss AR: 4.4564, Loss CFM: 0.5628, Grad Norm: 3.5299, LR: 0.000020
Epoch 0, Iteration 5500, Loss: 5.2048, Loss AR: 4.5930, Loss CFM: 0.6118, Grad Norm: 3.1278, LR: 0.000020
Epoch 0, Iteration 5510, Loss: 5.3022, Loss AR: 4.7801, Loss CFM: 0.5222, Grad Norm: 4.0718, LR: 0.000020
Epoch 0, Iteration 5520, Loss: 5.0840, Loss AR: 4.4756, Loss CFM: 0.6084, Grad Norm: 6.2534, LR: 0.000020
Epoch 0, Iteration 5530, Loss: 4.9518, Loss AR: 4.3323, Loss CFM: 0.6195, Grad Norm: 4.0962, LR: 0.000020
Epoch 0, Iteration 5540, Loss: 4.9504, Loss AR: 4.4588, Loss CFM: 0.4916, Grad Norm: 4.4140, LR: 0.000020
Epoch 0, Iteration 5550, Loss: 5.1567, Loss AR: 4.6206, Loss CFM: 0.5361, Grad Norm: 3.8498, LR: 0.000020
Epoch 0, Iteration 5560, Loss: 5.3632, Loss AR: 4.8304, Loss CFM: 0.5328, Grad Norm: 3.9274, LR: 0.000020
Epoch 0, Iteration 5570, Loss: 5.5295, Loss AR: 4.9335, Loss CFM: 0.5960, Grad Norm: 3.6868, LR: 0.000020
Epoch 0, Iteration 5580, Loss: 4.9435, Loss AR: 4.3365, Loss CFM: 0.6070, Grad Norm: 3.3362, LR: 0.000020
Epoch 0, Iteration 5590, Loss: 4.8178, Loss AR: 4.2356, Loss CFM: 0.5821, Grad Norm: 2.9792, LR: 0.000020
Epoch 0, Iteration 5600, Loss: 5.1358, Loss AR: 4.4978, Loss CFM: 0.6380, Grad Norm: 3.9210, LR: 0.000020
Epoch 0, Iteration 5610, Loss: 5.3125, Loss AR: 4.6968, Loss CFM: 0.6157, Grad Norm: 4.4705, LR: 0.000020
Epoch 0, Iteration 5620, Loss: 5.2227, Loss AR: 4.6993, Loss CFM: 0.5234, Grad Norm: 3.5269, LR: 0.000020
Epoch 0, Iteration 5630, Loss: 5.3464, Loss AR: 4.7679, Loss CFM: 0.5785, Grad Norm: 6.1030, LR: 0.000020
Epoch 0, Iteration 5640, Loss: 5.1337, Loss AR: 4.5673, Loss CFM: 0.5663, Grad Norm: 4.8825, LR: 0.000020
Epoch 0, Iteration 5650, Loss: 5.0507, Loss AR: 4.5541, Loss CFM: 0.4966, Grad Norm: 4.3156, LR: 0.000020
Epoch 0, Iteration 5660, Loss: 5.0618, Loss AR: 4.5386, Loss CFM: 0.5232, Grad Norm: 4.7007, LR: 0.000020
Epoch 0, Iteration 5670, Loss: 5.0989, Loss AR: 4.5574, Loss CFM: 0.5415, Grad Norm: 3.5839, LR: 0.000020
Epoch 0, Iteration 5680, Loss: 5.2288, Loss AR: 4.6967, Loss CFM: 0.5321, Grad Norm: 4.9315, LR: 0.000020
Epoch 0, Iteration 5690, Loss: 5.3337, Loss AR: 4.7085, Loss CFM: 0.6252, Grad Norm: 3.9858, LR: 0.000020
Epoch 0, Iteration 5700, Loss: 5.2313, Loss AR: 4.6775, Loss CFM: 0.5538, Grad Norm: 3.7665, LR: 0.000020
Epoch 0, Iteration 5710, Loss: 5.0724, Loss AR: 4.5410, Loss CFM: 0.5314, Grad Norm: 5.4109, LR: 0.000020
Epoch 0, Iteration 5720, Loss: 4.9929, Loss AR: 4.4208, Loss CFM: 0.5721, Grad Norm: 3.6188, LR: 0.000020
Epoch 0, Iteration 5730, Loss: 5.1186, Loss AR: 4.5843, Loss CFM: 0.5343, Grad Norm: 4.8427, LR: 0.000020
Epoch 0, Iteration 5740, Loss: 4.9561, Loss AR: 4.4791, Loss CFM: 0.4770, Grad Norm: 4.7657, LR: 0.000020
Epoch 0, Iteration 5750, Loss: 5.3329, Loss AR: 4.7677, Loss CFM: 0.5652, Grad Norm: 3.4938, LR: 0.000020
Epoch 0, Iteration 5760, Loss: 5.1837, Loss AR: 4.6863, Loss CFM: 0.4974, Grad Norm: 5.1220, LR: 0.000020
Epoch 0, Iteration 5770, Loss: 5.0571, Loss AR: 4.5099, Loss CFM: 0.5471, Grad Norm: 3.4548, LR: 0.000020
Epoch 0, Iteration 5780, Loss: 4.9860, Loss AR: 4.4401, Loss CFM: 0.5459, Grad Norm: 3.6487, LR: 0.000020
Epoch 0, Iteration 5790, Loss: 5.1994, Loss AR: 4.6521, Loss CFM: 0.5473, Grad Norm: 3.4299, LR: 0.000020
Epoch 0, Iteration 5800, Loss: 5.1405, Loss AR: 4.5771, Loss CFM: 0.5634, Grad Norm: 4.6439, LR: 0.000020
Epoch 0, Iteration 5810, Loss: 5.2557, Loss AR: 4.7234, Loss CFM: 0.5323, Grad Norm: 3.0877, LR: 0.000020
Epoch 0, Iteration 5820, Loss: 5.5004, Loss AR: 4.8220, Loss CFM: 0.6784, Grad Norm: 4.7354, LR: 0.000020
Epoch 0, Iteration 5830, Loss: 4.9833, Loss AR: 4.4699, Loss CFM: 0.5134, Grad Norm: 3.6430, LR: 0.000020
Epoch 0, Iteration 5840, Loss: 5.2528, Loss AR: 4.7294, Loss CFM: 0.5233, Grad Norm: 4.4143, LR: 0.000020
Epoch 0, Iteration 5850, Loss: 5.0748, Loss AR: 4.5279, Loss CFM: 0.5469, Grad Norm: 3.6993, LR: 0.000020
Epoch 0, Iteration 5860, Loss: 5.1972, Loss AR: 4.6057, Loss CFM: 0.5915, Grad Norm: 5.6847, LR: 0.000020
Epoch 0, Iteration 5870, Loss: 5.0203, Loss AR: 4.5323, Loss CFM: 0.4880, Grad Norm: 3.8068, LR: 0.000020
Epoch 0, Iteration 5880, Loss: 5.0370, Loss AR: 4.5384, Loss CFM: 0.4986, Grad Norm: 3.9995, LR: 0.000020
Epoch 0, Iteration 5890, Loss: 5.0961, Loss AR: 4.5451, Loss CFM: 0.5511, Grad Norm: 3.5928, LR: 0.000020
Epoch 0, Iteration 5900, Loss: 5.0346, Loss AR: 4.4901, Loss CFM: 0.5446, Grad Norm: 3.9876, LR: 0.000020
Epoch 0, Iteration 5910, Loss: 5.0820, Loss AR: 4.5417, Loss CFM: 0.5403, Grad Norm: 3.1711, LR: 0.000020
Epoch 0, Iteration 5920, Loss: 5.2228, Loss AR: 4.5156, Loss CFM: 0.7072, Grad Norm: 4.0234, LR: 0.000020
Epoch 0, Iteration 5930, Loss: 5.2386, Loss AR: 4.7356, Loss CFM: 0.5029, Grad Norm: 3.4977, LR: 0.000020
Epoch 0, Iteration 5940, Loss: 5.3127, Loss AR: 4.7782, Loss CFM: 0.5345, Grad Norm: 4.7608, LR: 0.000020
Epoch 0, Iteration 5950, Loss: 5.2000, Loss AR: 4.6884, Loss CFM: 0.5117, Grad Norm: 4.2956, LR: 0.000020
Epoch 0, Iteration 5960, Loss: 5.1019, Loss AR: 4.5660, Loss CFM: 0.5358, Grad Norm: 3.8026, LR: 0.000020
Epoch 0, Iteration 5970, Loss: 5.0858, Loss AR: 4.5444, Loss CFM: 0.5415, Grad Norm: 3.6924, LR: 0.000020
Epoch 0, Iteration 5980, Loss: 5.0293, Loss AR: 4.5065, Loss CFM: 0.5228, Grad Norm: 3.5017, LR: 0.000020
Epoch 0, Iteration 5990, Loss: 5.4308, Loss AR: 4.8234, Loss CFM: 0.6074, Grad Norm: 3.8901, LR: 0.000020
Epoch 0, Iteration 6000, Loss: 5.1663, Loss AR: 4.6231, Loss CFM: 0.5432, Grad Norm: 4.1875, LR: 0.000020
Epoch 0, Iteration 6010, Loss: 5.0635, Loss AR: 4.5998, Loss CFM: 0.4637, Grad Norm: 4.5447, LR: 0.000020
Epoch 0, Iteration 6020, Loss: 4.9921, Loss AR: 4.4594, Loss CFM: 0.5326, Grad Norm: 3.2315, LR: 0.000020
Epoch 0, Iteration 6030, Loss: 5.0811, Loss AR: 4.5172, Loss CFM: 0.5639, Grad Norm: 3.8761, LR: 0.000020
Epoch 0, Iteration 6040, Loss: 5.1247, Loss AR: 4.5803, Loss CFM: 0.5444, Grad Norm: 3.2858, LR: 0.000020
Epoch 0, Iteration 6050, Loss: 5.2236, Loss AR: 4.6938, Loss CFM: 0.5298, Grad Norm: 4.7414, LR: 0.000020
Epoch 0, Iteration 6060, Loss: 5.2269, Loss AR: 4.6706, Loss CFM: 0.5563, Grad Norm: 5.0035, LR: 0.000020
Epoch 0, Iteration 6070, Loss: 5.6425, Loss AR: 5.0754, Loss CFM: 0.5671, Grad Norm: 5.2226, LR: 0.000020
Epoch 0, Iteration 6080, Loss: 5.3310, Loss AR: 4.8184, Loss CFM: 0.5127, Grad Norm: 4.9426, LR: 0.000020
Epoch 0, Iteration 6090, Loss: 4.9776, Loss AR: 4.4492, Loss CFM: 0.5284, Grad Norm: 5.1043, LR: 0.000020
Epoch 0, Iteration 6100, Loss: 5.3443, Loss AR: 4.7733, Loss CFM: 0.5711, Grad Norm: 3.9529, LR: 0.000020
Epoch 0, Iteration 6110, Loss: 5.3924, Loss AR: 4.8690, Loss CFM: 0.5234, Grad Norm: 5.2329, LR: 0.000020
Epoch 0, Iteration 6120, Loss: 5.0652, Loss AR: 4.5086, Loss CFM: 0.5565, Grad Norm: 2.9334, LR: 0.000020
Epoch 0, Iteration 6130, Loss: 5.1450, Loss AR: 4.6405, Loss CFM: 0.5045, Grad Norm: 4.5272, LR: 0.000020
Epoch 0, Iteration 6140, Loss: 4.9516, Loss AR: 4.3885, Loss CFM: 0.5631, Grad Norm: 4.4135, LR: 0.000020
Epoch 0, Iteration 6150, Loss: 5.1818, Loss AR: 4.6166, Loss CFM: 0.5653, Grad Norm: 4.3237, LR: 0.000020
Epoch 0, Iteration 6160, Loss: 5.3098, Loss AR: 4.8029, Loss CFM: 0.5070, Grad Norm: 4.2339, LR: 0.000020
Epoch 0, Iteration 6170, Loss: 5.1513, Loss AR: 4.5591, Loss CFM: 0.5922, Grad Norm: 4.5624, LR: 0.000020
Epoch 0, Iteration 6180, Loss: 5.2802, Loss AR: 4.7426, Loss CFM: 0.5376, Grad Norm: 3.8962, LR: 0.000020
Epoch 0, Iteration 6190, Loss: 5.1829, Loss AR: 4.6432, Loss CFM: 0.5397, Grad Norm: 4.4094, LR: 0.000020
Epoch 0, Iteration 6200, Loss: 5.2502, Loss AR: 4.7128, Loss CFM: 0.5375, Grad Norm: 5.3501, LR: 0.000020
Epoch 0, Iteration 6210, Loss: 5.4088, Loss AR: 4.8340, Loss CFM: 0.5748, Grad Norm: 3.6581, LR: 0.000020
Epoch 0, Iteration 6220, Loss: 5.0138, Loss AR: 4.4571, Loss CFM: 0.5567, Grad Norm: 4.3737, LR: 0.000020
Epoch 0, Iteration 6230, Loss: 5.3098, Loss AR: 4.8114, Loss CFM: 0.4983, Grad Norm: 3.9318, LR: 0.000020
Epoch 0, Iteration 6240, Loss: 5.1959, Loss AR: 4.5879, Loss CFM: 0.6080, Grad Norm: 4.1029, LR: 0.000020
Epoch 0, Iteration 6250, Loss: 5.1862, Loss AR: 4.6648, Loss CFM: 0.5214, Grad Norm: 5.1517, LR: 0.000020
Epoch 0, Iteration 6260, Loss: 5.1190, Loss AR: 4.5193, Loss CFM: 0.5997, Grad Norm: 3.5953, LR: 0.000020
Epoch 0, Iteration 6270, Loss: 5.4475, Loss AR: 4.9034, Loss CFM: 0.5440, Grad Norm: 7.1714, LR: 0.000020
Epoch 0, Iteration 6280, Loss: 5.0837, Loss AR: 4.4906, Loss CFM: 0.5930, Grad Norm: 5.3045, LR: 0.000020
Epoch 0, Iteration 6290, Loss: 5.2506, Loss AR: 4.6890, Loss CFM: 0.5616, Grad Norm: 4.9415, LR: 0.000020
Epoch 0, Iteration 6300, Loss: 5.2566, Loss AR: 4.7238, Loss CFM: 0.5328, Grad Norm: 4.5269, LR: 0.000020
Epoch 0, Iteration 6310, Loss: 5.0661, Loss AR: 4.5107, Loss CFM: 0.5555, Grad Norm: 3.0607, LR: 0.000020
Epoch 0, Iteration 6320, Loss: 5.2378, Loss AR: 4.6563, Loss CFM: 0.5814, Grad Norm: 4.8560, LR: 0.000020
Epoch 0, Iteration 6330, Loss: 5.1246, Loss AR: 4.5111, Loss CFM: 0.6136, Grad Norm: 3.4441, LR: 0.000019
Epoch 0, Iteration 6340, Loss: 5.0000, Loss AR: 4.4934, Loss CFM: 0.5066, Grad Norm: 3.9527, LR: 0.000019
Epoch 0, Iteration 6350, Loss: 4.9555, Loss AR: 4.4465, Loss CFM: 0.5091, Grad Norm: 3.1917, LR: 0.000019
Epoch 0, Iteration 6360, Loss: 5.1817, Loss AR: 4.6401, Loss CFM: 0.5416, Grad Norm: 3.4315, LR: 0.000019
Epoch 0, Iteration 6370, Loss: 5.0669, Loss AR: 4.4390, Loss CFM: 0.6279, Grad Norm: 3.8356, LR: 0.000019
Epoch 0, Iteration 6380, Loss: 5.2818, Loss AR: 4.7406, Loss CFM: 0.5412, Grad Norm: 4.9514, LR: 0.000019
Epoch 0, Iteration 6390, Loss: 5.1998, Loss AR: 4.6467, Loss CFM: 0.5531, Grad Norm: 6.8049, LR: 0.000019
Epoch 0, Iteration 6400, Loss: 5.1251, Loss AR: 4.4100, Loss CFM: 0.7151, Grad Norm: 6.5617, LR: 0.000019
Epoch 0, Iteration 6410, Loss: 5.1825, Loss AR: 4.5872, Loss CFM: 0.5953, Grad Norm: 4.1752, LR: 0.000019
Epoch 0, Iteration 6420, Loss: 4.8497, Loss AR: 4.3044, Loss CFM: 0.5454, Grad Norm: 4.4047, LR: 0.000019
Epoch 0, Iteration 6430, Loss: 5.3639, Loss AR: 4.7919, Loss CFM: 0.5720, Grad Norm: 4.3930, LR: 0.000019
Epoch 0, Iteration 6440, Loss: 5.1205, Loss AR: 4.5042, Loss CFM: 0.6162, Grad Norm: 4.9527, LR: 0.000019
Epoch 0, Iteration 6450, Loss: 5.3300, Loss AR: 4.8054, Loss CFM: 0.5247, Grad Norm: 3.7313, LR: 0.000019
Epoch 0, Iteration 6460, Loss: 5.2083, Loss AR: 4.6725, Loss CFM: 0.5357, Grad Norm: 3.4169, LR: 0.000019
Epoch 0, Iteration 6470, Loss: 5.1211, Loss AR: 4.5486, Loss CFM: 0.5725, Grad Norm: 4.3771, LR: 0.000019
Epoch 0, Iteration 6480, Loss: 5.2103, Loss AR: 4.6831, Loss CFM: 0.5272, Grad Norm: 4.7659, LR: 0.000019
Epoch 0, Iteration 6490, Loss: 5.2766, Loss AR: 4.7272, Loss CFM: 0.5494, Grad Norm: 5.0404, LR: 0.000019
Epoch 0, Iteration 6500, Loss: 5.3015, Loss AR: 4.7867, Loss CFM: 0.5149, Grad Norm: 7.1245, LR: 0.000019
Epoch 0, Iteration 6510, Loss: 5.1856, Loss AR: 4.6218, Loss CFM: 0.5638, Grad Norm: 3.4542, LR: 0.000019
Epoch 0, Iteration 6520, Loss: 5.1048, Loss AR: 4.5573, Loss CFM: 0.5475, Grad Norm: 3.0890, LR: 0.000019
Epoch 0, Iteration 6530, Loss: 5.0344, Loss AR: 4.5341, Loss CFM: 0.5003, Grad Norm: 4.0986, LR: 0.000019
Epoch 0, Iteration 6540, Loss: 5.2314, Loss AR: 4.6765, Loss CFM: 0.5549, Grad Norm: 3.8998, LR: 0.000019
Epoch 0, Iteration 6550, Loss: 5.2216, Loss AR: 4.6804, Loss CFM: 0.5412, Grad Norm: 3.7790, LR: 0.000019
Epoch 0, Iteration 6560, Loss: 5.4278, Loss AR: 4.8579, Loss CFM: 0.5699, Grad Norm: 3.8183, LR: 0.000019
Epoch 0, Iteration 6570, Loss: 5.2744, Loss AR: 4.6990, Loss CFM: 0.5753, Grad Norm: 4.4646, LR: 0.000019
Epoch 0, Iteration 6580, Loss: 5.1689, Loss AR: 4.5747, Loss CFM: 0.5942, Grad Norm: 6.1790, LR: 0.000019
Epoch 0, Iteration 6590, Loss: 5.1281, Loss AR: 4.6002, Loss CFM: 0.5279, Grad Norm: 4.1944, LR: 0.000019
Epoch 0, Iteration 6600, Loss: 5.1602, Loss AR: 4.5919, Loss CFM: 0.5683, Grad Norm: 3.7127, LR: 0.000019
Epoch 0, Iteration 6610, Loss: 5.1095, Loss AR: 4.5505, Loss CFM: 0.5590, Grad Norm: 4.0452, LR: 0.000019
Epoch 0, Iteration 6620, Loss: 4.9973, Loss AR: 4.4364, Loss CFM: 0.5608, Grad Norm: 3.9762, LR: 0.000019
Epoch 0, Iteration 6630, Loss: 5.4111, Loss AR: 4.7722, Loss CFM: 0.6389, Grad Norm: 5.0188, LR: 0.000019
Epoch 0, Iteration 6640, Loss: 5.5344, Loss AR: 4.9061, Loss CFM: 0.6283, Grad Norm: 3.6843, LR: 0.000019
Epoch 0, Iteration 6650, Loss: 5.1319, Loss AR: 4.5258, Loss CFM: 0.6060, Grad Norm: 4.8867, LR: 0.000019
Epoch 0, Iteration 6660, Loss: 5.2358, Loss AR: 4.7039, Loss CFM: 0.5318, Grad Norm: 4.1711, LR: 0.000019
Epoch 0, Iteration 6670, Loss: 5.2529, Loss AR: 4.7448, Loss CFM: 0.5081, Grad Norm: 3.6177, LR: 0.000019
Epoch 0, Iteration 6680, Loss: 5.2021, Loss AR: 4.6696, Loss CFM: 0.5325, Grad Norm: 3.5628, LR: 0.000019
Epoch 0, Iteration 6690, Loss: 5.1543, Loss AR: 4.6095, Loss CFM: 0.5449, Grad Norm: 3.7097, LR: 0.000019
Epoch 0, Iteration 6700, Loss: 4.7817, Loss AR: 4.2611, Loss CFM: 0.5206, Grad Norm: 2.4329, LR: 0.000019
Epoch 0, Iteration 6710, Loss: 5.1425, Loss AR: 4.5179, Loss CFM: 0.6246, Grad Norm: 5.2110, LR: 0.000019
Epoch 0, Iteration 6720, Loss: 5.2087, Loss AR: 4.5725, Loss CFM: 0.6362, Grad Norm: 5.0567, LR: 0.000019
Epoch 0, Iteration 6730, Loss: 5.3565, Loss AR: 4.8012, Loss CFM: 0.5553, Grad Norm: 3.6151, LR: 0.000019
Epoch 0, Iteration 6740, Loss: 5.2483, Loss AR: 4.7125, Loss CFM: 0.5359, Grad Norm: 4.3078, LR: 0.000019
Epoch 0, Iteration 6750, Loss: 5.2012, Loss AR: 4.5893, Loss CFM: 0.6119, Grad Norm: 6.3375, LR: 0.000019
Epoch 0, Iteration 6760, Loss: 5.2373, Loss AR: 4.6837, Loss CFM: 0.5536, Grad Norm: 5.3178, LR: 0.000019
Epoch 0, Iteration 6770, Loss: 5.4011, Loss AR: 4.8433, Loss CFM: 0.5578, Grad Norm: 3.8942, LR: 0.000019
Epoch 0, Iteration 6780, Loss: 5.2322, Loss AR: 4.6072, Loss CFM: 0.6250, Grad Norm: 4.6330, LR: 0.000019
Epoch 0, Iteration 6790, Loss: 5.0266, Loss AR: 4.4407, Loss CFM: 0.5859, Grad Norm: 3.8813, LR: 0.000019
Epoch 0, Iteration 6800, Loss: 5.3094, Loss AR: 4.7102, Loss CFM: 0.5992, Grad Norm: 6.0292, LR: 0.000019
Epoch 0, Iteration 6810, Loss: 5.2314, Loss AR: 4.6178, Loss CFM: 0.6136, Grad Norm: 4.1249, LR: 0.000019
Epoch 0, Iteration 6820, Loss: 5.0917, Loss AR: 4.5702, Loss CFM: 0.5216, Grad Norm: 3.8374, LR: 0.000019
Epoch 0, Iteration 6830, Loss: 5.1918, Loss AR: 4.6858, Loss CFM: 0.5060, Grad Norm: 4.1714, LR: 0.000019
Epoch 0, Iteration 6840, Loss: 5.1743, Loss AR: 4.6566, Loss CFM: 0.5177, Grad Norm: 4.7135, LR: 0.000019
Epoch 0, Iteration 6850, Loss: 5.2902, Loss AR: 4.7573, Loss CFM: 0.5329, Grad Norm: 5.0818, LR: 0.000019
Epoch 0, Iteration 6860, Loss: 5.1983, Loss AR: 4.6148, Loss CFM: 0.5835, Grad Norm: 4.0254, LR: 0.000019
Epoch 0, Iteration 6870, Loss: 5.0962, Loss AR: 4.5304, Loss CFM: 0.5658, Grad Norm: 3.7727, LR: 0.000019
Epoch 0, Iteration 6880, Loss: 5.1014, Loss AR: 4.6096, Loss CFM: 0.4918, Grad Norm: 4.0797, LR: 0.000019
Epoch 0, Iteration 6890, Loss: 5.0232, Loss AR: 4.5518, Loss CFM: 0.4714, Grad Norm: 4.4166, LR: 0.000019
Epoch 0, Iteration 6900, Loss: 5.2193, Loss AR: 4.6112, Loss CFM: 0.6080, Grad Norm: 3.9558, LR: 0.000019
Epoch 0, Iteration 6910, Loss: 4.9282, Loss AR: 4.4239, Loss CFM: 0.5043, Grad Norm: 3.4236, LR: 0.000019
Epoch 0, Iteration 6920, Loss: 5.1317, Loss AR: 4.6616, Loss CFM: 0.4702, Grad Norm: 3.9008, LR: 0.000019
Epoch 0, Iteration 6930, Loss: 5.1752, Loss AR: 4.6810, Loss CFM: 0.4943, Grad Norm: 4.4829, LR: 0.000019
Epoch 0, Iteration 6940, Loss: 5.0129, Loss AR: 4.5246, Loss CFM: 0.4882, Grad Norm: 3.7114, LR: 0.000019
Epoch 0, Iteration 6950, Loss: 5.3669, Loss AR: 4.7302, Loss CFM: 0.6367, Grad Norm: 10.1573, LR: 0.000019
Epoch 0, Iteration 6960, Loss: 5.1626, Loss AR: 4.5510, Loss CFM: 0.6116, Grad Norm: 4.6152, LR: 0.000019
Epoch 0, Iteration 6970, Loss: 5.1129, Loss AR: 4.5425, Loss CFM: 0.5704, Grad Norm: 4.8463, LR: 0.000019
Epoch 0, Iteration 6980, Loss: 4.9803, Loss AR: 4.4394, Loss CFM: 0.5409, Grad Norm: 3.1916, LR: 0.000019
Epoch 0, Iteration 6990, Loss: 5.0474, Loss AR: 4.4750, Loss CFM: 0.5724, Grad Norm: 4.4176, LR: 0.000019
Epoch 0, Iteration 7000, Loss: 5.2271, Loss AR: 4.6173, Loss CFM: 0.6097, Grad Norm: 4.9717, LR: 0.000019
Epoch 0, Iteration 7010, Loss: 5.1149, Loss AR: 4.5765, Loss CFM: 0.5384, Grad Norm: 3.6523, LR: 0.000019
Epoch 0, Iteration 7020, Loss: 5.3976, Loss AR: 4.7923, Loss CFM: 0.6053, Grad Norm: 4.0559, LR: 0.000019
Epoch 0, Iteration 7030, Loss: 4.9202, Loss AR: 4.3918, Loss CFM: 0.5284, Grad Norm: 3.5635, LR: 0.000019
Epoch 0, Iteration 7040, Loss: 5.2297, Loss AR: 4.6445, Loss CFM: 0.5852, Grad Norm: 4.6854, LR: 0.000019
Epoch 0, Iteration 7050, Loss: 5.1965, Loss AR: 4.7029, Loss CFM: 0.4936, Grad Norm: 4.2550, LR: 0.000019
Epoch 0, Iteration 7060, Loss: 5.0333, Loss AR: 4.5395, Loss CFM: 0.4938, Grad Norm: 4.7420, LR: 0.000019
Epoch 0, Iteration 7070, Loss: 5.0250, Loss AR: 4.4664, Loss CFM: 0.5586, Grad Norm: 7.9312, LR: 0.000019
Epoch 0, Iteration 7080, Loss: 5.0735, Loss AR: 4.5329, Loss CFM: 0.5406, Grad Norm: 3.5955, LR: 0.000019
Epoch 0, Iteration 7090, Loss: 5.2034, Loss AR: 4.5537, Loss CFM: 0.6497, Grad Norm: 3.7197, LR: 0.000019
Epoch 0, Iteration 7100, Loss: 5.0361, Loss AR: 4.4663, Loss CFM: 0.5698, Grad Norm: 4.1020, LR: 0.000019
Epoch 0, Iteration 7110, Loss: 5.1038, Loss AR: 4.5585, Loss CFM: 0.5453, Grad Norm: 3.1392, LR: 0.000019
Epoch 0, Iteration 7120, Loss: 5.0382, Loss AR: 4.4297, Loss CFM: 0.6085, Grad Norm: 3.8268, LR: 0.000019
Epoch 0, Iteration 7130, Loss: 5.3811, Loss AR: 4.7197, Loss CFM: 0.6614, Grad Norm: 4.9527, LR: 0.000019
Epoch 0, Iteration 7140, Loss: 5.2075, Loss AR: 4.6718, Loss CFM: 0.5358, Grad Norm: 4.4918, LR: 0.000019
Epoch 0, Iteration 7150, Loss: 5.1992, Loss AR: 4.6898, Loss CFM: 0.5094, Grad Norm: 3.8708, LR: 0.000019
Epoch 0, Iteration 7160, Loss: 5.2365, Loss AR: 4.7294, Loss CFM: 0.5071, Grad Norm: 4.3968, LR: 0.000019
Epoch 0, Iteration 7170, Loss: 5.0208, Loss AR: 4.4594, Loss CFM: 0.5614, Grad Norm: 3.8493, LR: 0.000019
Epoch 0, Iteration 7180, Loss: 5.1543, Loss AR: 4.5920, Loss CFM: 0.5623, Grad Norm: 4.2152, LR: 0.000019
Epoch 0, Iteration 7190, Loss: 4.9275, Loss AR: 4.3247, Loss CFM: 0.6028, Grad Norm: 5.2517, LR: 0.000019
Epoch 0, Iteration 7200, Loss: 5.1017, Loss AR: 4.5198, Loss CFM: 0.5819, Grad Norm: 3.5767, LR: 0.000019
Epoch 0, Iteration 7210, Loss: 5.0041, Loss AR: 4.4640, Loss CFM: 0.5401, Grad Norm: 7.9798, LR: 0.000019
Epoch 0, Iteration 7220, Loss: 4.8875, Loss AR: 4.3591, Loss CFM: 0.5284, Grad Norm: 3.2535, LR: 0.000019
Epoch 0, Iteration 7230, Loss: 5.3539, Loss AR: 4.8399, Loss CFM: 0.5140, Grad Norm: 4.2439, LR: 0.000019
Epoch 0, Iteration 7240, Loss: 5.3591, Loss AR: 4.7830, Loss CFM: 0.5761, Grad Norm: 4.6494, LR: 0.000019
Epoch 0, Iteration 7250, Loss: 5.3809, Loss AR: 4.7346, Loss CFM: 0.6462, Grad Norm: 4.6956, LR: 0.000019
Epoch 0, Iteration 7260, Loss: 5.2888, Loss AR: 4.6752, Loss CFM: 0.6137, Grad Norm: 3.2310, LR: 0.000019
Epoch 0, Iteration 7270, Loss: 5.0389, Loss AR: 4.5370, Loss CFM: 0.5020, Grad Norm: 3.7440, LR: 0.000019
Epoch 0, Iteration 7280, Loss: 5.1920, Loss AR: 4.6617, Loss CFM: 0.5303, Grad Norm: 3.1691, LR: 0.000019
Epoch 0, Iteration 7290, Loss: 4.9900, Loss AR: 4.4832, Loss CFM: 0.5069, Grad Norm: 4.6201, LR: 0.000019
Epoch 0, Iteration 7300, Loss: 4.8877, Loss AR: 4.3656, Loss CFM: 0.5221, Grad Norm: 3.6555, LR: 0.000019
Epoch 0, Iteration 7310, Loss: 4.8543, Loss AR: 4.3558, Loss CFM: 0.4986, Grad Norm: 3.6917, LR: 0.000019
Epoch 0, Iteration 7320, Loss: 5.1995, Loss AR: 4.6823, Loss CFM: 0.5172, Grad Norm: 3.7158, LR: 0.000019
Epoch 0, Iteration 7330, Loss: 5.0528, Loss AR: 4.5421, Loss CFM: 0.5107, Grad Norm: 3.9829, LR: 0.000019
Epoch 0, Iteration 7340, Loss: 5.4254, Loss AR: 4.8444, Loss CFM: 0.5810, Grad Norm: 4.7016, LR: 0.000019
Epoch 0, Iteration 7350, Loss: 5.1519, Loss AR: 4.5455, Loss CFM: 0.6065, Grad Norm: 4.0872, LR: 0.000019
Epoch 0, Iteration 7360, Loss: 4.9892, Loss AR: 4.4288, Loss CFM: 0.5604, Grad Norm: 3.5145, LR: 0.000019
Epoch 0, Iteration 7370, Loss: 5.1716, Loss AR: 4.6645, Loss CFM: 0.5072, Grad Norm: 3.7427, LR: 0.000019
Epoch 0, Iteration 7380, Loss: 5.0572, Loss AR: 4.5368, Loss CFM: 0.5204, Grad Norm: 5.4854, LR: 0.000019
Epoch 0, Iteration 7390, Loss: 5.3011, Loss AR: 4.7683, Loss CFM: 0.5328, Grad Norm: 4.5261, LR: 0.000019
Epoch 0, Iteration 7400, Loss: 4.9959, Loss AR: 4.4587, Loss CFM: 0.5373, Grad Norm: 4.1975, LR: 0.000019
Epoch 0, Iteration 7410, Loss: 5.1968, Loss AR: 4.7430, Loss CFM: 0.4538, Grad Norm: 4.6629, LR: 0.000019
Epoch 0, Iteration 7420, Loss: 5.1214, Loss AR: 4.6228, Loss CFM: 0.4986, Grad Norm: 6.0855, LR: 0.000019
Epoch 0, Iteration 7430, Loss: 5.0556, Loss AR: 4.5117, Loss CFM: 0.5439, Grad Norm: 4.0704, LR: 0.000019
Epoch 0, Iteration 7440, Loss: 5.0218, Loss AR: 4.4888, Loss CFM: 0.5330, Grad Norm: 3.8548, LR: 0.000019
Epoch 0, Iteration 7450, Loss: 4.9942, Loss AR: 4.3827, Loss CFM: 0.6115, Grad Norm: 4.1147, LR: 0.000019
Epoch 0, Iteration 7460, Loss: 5.2046, Loss AR: 4.7042, Loss CFM: 0.5004, Grad Norm: 4.3753, LR: 0.000019
Epoch 0, Iteration 7470, Loss: 5.0229, Loss AR: 4.4270, Loss CFM: 0.5959, Grad Norm: 5.1354, LR: 0.000019
Epoch 0, Iteration 7480, Loss: 5.0658, Loss AR: 4.5509, Loss CFM: 0.5149, Grad Norm: 4.0452, LR: 0.000019
Epoch 0, Iteration 7490, Loss: 5.2361, Loss AR: 4.6664, Loss CFM: 0.5697, Grad Norm: 4.4250, LR: 0.000019
Epoch 0, Iteration 7500, Loss: 5.1786, Loss AR: 4.6638, Loss CFM: 0.5148, Grad Norm: 4.0970, LR: 0.000019
Epoch 0, Iteration 7510, Loss: 5.3220, Loss AR: 4.7255, Loss CFM: 0.5965, Grad Norm: 5.0927, LR: 0.000019
Epoch 0, Iteration 7520, Loss: 5.1065, Loss AR: 4.5647, Loss CFM: 0.5418, Grad Norm: 5.9343, LR: 0.000019
Epoch 0, Iteration 7530, Loss: 5.1239, Loss AR: 4.5799, Loss CFM: 0.5440, Grad Norm: 5.7337, LR: 0.000019
Epoch 0, Iteration 7540, Loss: 5.1740, Loss AR: 4.6586, Loss CFM: 0.5154, Grad Norm: 3.8538, LR: 0.000019
Epoch 0, Iteration 7550, Loss: 5.1278, Loss AR: 4.5731, Loss CFM: 0.5547, Grad Norm: 4.8404, LR: 0.000019
Epoch 0, Iteration 7560, Loss: 4.8928, Loss AR: 4.3868, Loss CFM: 0.5061, Grad Norm: 4.0398, LR: 0.000019
Epoch 0, Iteration 7570, Loss: 5.0756, Loss AR: 4.4866, Loss CFM: 0.5891, Grad Norm: 4.1161, LR: 0.000019
Epoch 0, Iteration 7580, Loss: 4.9160, Loss AR: 4.4026, Loss CFM: 0.5134, Grad Norm: 4.7492, LR: 0.000019
Epoch 0, Iteration 7590, Loss: 5.2416, Loss AR: 4.6381, Loss CFM: 0.6034, Grad Norm: 4.8868, LR: 0.000019
Epoch 0, Iteration 7600, Loss: 4.8576, Loss AR: 4.3599, Loss CFM: 0.4977, Grad Norm: 3.7380, LR: 0.000019
Epoch 0, Iteration 7610, Loss: 4.9111, Loss AR: 4.3952, Loss CFM: 0.5159, Grad Norm: 3.9014, LR: 0.000019
Epoch 0, Iteration 7620, Loss: 5.0586, Loss AR: 4.6115, Loss CFM: 0.4471, Grad Norm: 5.6117, LR: 0.000019
Epoch 0, Iteration 7630, Loss: 5.0269, Loss AR: 4.4367, Loss CFM: 0.5902, Grad Norm: 3.2754, LR: 0.000019
Epoch 0, Iteration 7640, Loss: 5.2136, Loss AR: 4.6730, Loss CFM: 0.5407, Grad Norm: 5.0639, LR: 0.000019
Epoch 0, Iteration 7650, Loss: 4.7956, Loss AR: 4.2893, Loss CFM: 0.5064, Grad Norm: 4.0777, LR: 0.000019
Epoch 0, Iteration 7660, Loss: 5.2403, Loss AR: 4.6080, Loss CFM: 0.6323, Grad Norm: 4.3468, LR: 0.000019
Epoch 0, Iteration 7670, Loss: 5.1574, Loss AR: 4.6642, Loss CFM: 0.4932, Grad Norm: 4.2993, LR: 0.000019
Epoch 0, Iteration 7680, Loss: 5.0005, Loss AR: 4.4150, Loss CFM: 0.5854, Grad Norm: 4.3153, LR: 0.000019
Epoch 0, Iteration 7690, Loss: 5.2315, Loss AR: 4.6677, Loss CFM: 0.5638, Grad Norm: 4.2230, LR: 0.000019
Epoch 0, Iteration 7700, Loss: 4.9857, Loss AR: 4.5081, Loss CFM: 0.4776, Grad Norm: 5.1436, LR: 0.000019
Epoch 0, Iteration 7710, Loss: 5.0532, Loss AR: 4.5524, Loss CFM: 0.5008, Grad Norm: 3.9466, LR: 0.000019
Epoch 0, Iteration 7720, Loss: 5.1102, Loss AR: 4.5272, Loss CFM: 0.5830, Grad Norm: 4.1372, LR: 0.000019
Epoch 0, Iteration 7730, Loss: 4.9497, Loss AR: 4.3222, Loss CFM: 0.6274, Grad Norm: 5.9233, LR: 0.000019
Epoch 0, Iteration 7740, Loss: 5.5073, Loss AR: 4.8698, Loss CFM: 0.6375, Grad Norm: 7.1654, LR: 0.000019
Epoch 0, Iteration 7750, Loss: 5.3122, Loss AR: 4.7824, Loss CFM: 0.5298, Grad Norm: 3.4089, LR: 0.000019
Epoch 0, Iteration 7760, Loss: 5.2487, Loss AR: 4.6770, Loss CFM: 0.5717, Grad Norm: 3.8937, LR: 0.000019
Epoch 0, Iteration 7770, Loss: 5.0458, Loss AR: 4.4906, Loss CFM: 0.5552, Grad Norm: 3.5717, LR: 0.000019
Epoch 0, Iteration 7780, Loss: 5.2542, Loss AR: 4.7002, Loss CFM: 0.5540, Grad Norm: 4.1528, LR: 0.000019
Epoch 0, Iteration 7790, Loss: 4.9402, Loss AR: 4.3700, Loss CFM: 0.5702, Grad Norm: 3.0790, LR: 0.000019
Epoch 0, Iteration 7800, Loss: 4.8625, Loss AR: 4.2973, Loss CFM: 0.5651, Grad Norm: 4.4536, LR: 0.000019
Epoch 0, Iteration 7810, Loss: 5.0062, Loss AR: 4.4728, Loss CFM: 0.5334, Grad Norm: 5.4296, LR: 0.000019
Epoch 0, Iteration 7820, Loss: 5.0678, Loss AR: 4.4909, Loss CFM: 0.5769, Grad Norm: 6.9036, LR: 0.000019
Epoch 0, Iteration 7830, Loss: 5.0049, Loss AR: 4.4629, Loss CFM: 0.5420, Grad Norm: 3.8629, LR: 0.000019
Epoch 0, Iteration 7840, Loss: 5.1359, Loss AR: 4.6010, Loss CFM: 0.5349, Grad Norm: 7.4330, LR: 0.000019
Epoch 0, Iteration 7850, Loss: 5.1277, Loss AR: 4.5738, Loss CFM: 0.5539, Grad Norm: 4.2355, LR: 0.000019
Epoch 0, Iteration 7860, Loss: 5.1293, Loss AR: 4.6034, Loss CFM: 0.5259, Grad Norm: 4.8568, LR: 0.000019
Epoch 0, Iteration 7870, Loss: 5.1358, Loss AR: 4.6266, Loss CFM: 0.5093, Grad Norm: 5.6911, LR: 0.000019
Epoch 0, Iteration 7880, Loss: 5.1360, Loss AR: 4.6250, Loss CFM: 0.5110, Grad Norm: 3.9614, LR: 0.000019
Epoch 0, Iteration 7890, Loss: 4.9356, Loss AR: 4.4640, Loss CFM: 0.4716, Grad Norm: 3.6272, LR: 0.000019
Epoch 0, Iteration 7900, Loss: 4.8457, Loss AR: 4.3992, Loss CFM: 0.4465, Grad Norm: 9.0939, LR: 0.000019
Epoch 0, Iteration 7910, Loss: 5.3441, Loss AR: 4.8363, Loss CFM: 0.5078, Grad Norm: 3.9968, LR: 0.000019
Epoch 0, Iteration 7920, Loss: 5.4102, Loss AR: 4.8715, Loss CFM: 0.5387, Grad Norm: 3.6651, LR: 0.000019
Epoch 0, Iteration 7930, Loss: 5.1592, Loss AR: 4.6663, Loss CFM: 0.4929, Grad Norm: 4.7423, LR: 0.000019
Epoch 0, Iteration 7940, Loss: 5.2338, Loss AR: 4.6946, Loss CFM: 0.5391, Grad Norm: 5.8394, LR: 0.000019
Epoch 0, Iteration 7950, Loss: 5.1451, Loss AR: 4.5757, Loss CFM: 0.5694, Grad Norm: 6.7132, LR: 0.000019
Epoch 0, Iteration 7960, Loss: 5.1496, Loss AR: 4.5598, Loss CFM: 0.5898, Grad Norm: 3.5772, LR: 0.000019
Epoch 0, Iteration 7970, Loss: 5.1087, Loss AR: 4.4310, Loss CFM: 0.6777, Grad Norm: 4.3027, LR: 0.000019
Epoch 0, Iteration 7980, Loss: 4.9602, Loss AR: 4.4915, Loss CFM: 0.4687, Grad Norm: 3.6467, LR: 0.000019
Epoch 0, Iteration 7990, Loss: 5.1245, Loss AR: 4.5622, Loss CFM: 0.5623, Grad Norm: 5.8588, LR: 0.000019
Epoch 0, Iteration 8000, Loss: 5.1812, Loss AR: 4.6518, Loss CFM: 0.5294, Grad Norm: 4.1396, LR: 0.000019
Epoch 0, Iteration 8010, Loss: 4.8432, Loss AR: 4.3066, Loss CFM: 0.5366, Grad Norm: 2.8481, LR: 0.000019
Epoch 0, Iteration 8020, Loss: 5.0005, Loss AR: 4.4315, Loss CFM: 0.5691, Grad Norm: 5.2006, LR: 0.000019
Epoch 0, Iteration 8030, Loss: 5.0537, Loss AR: 4.5449, Loss CFM: 0.5088, Grad Norm: 4.3997, LR: 0.000019
Epoch 0, Iteration 8040, Loss: 5.1333, Loss AR: 4.6532, Loss CFM: 0.4802, Grad Norm: 3.6685, LR: 0.000019
Epoch 0, Iteration 8050, Loss: 5.0306, Loss AR: 4.4507, Loss CFM: 0.5800, Grad Norm: 2.7322, LR: 0.000019
Epoch 0, Iteration 8060, Loss: 4.9285, Loss AR: 4.4195, Loss CFM: 0.5091, Grad Norm: 3.7121, LR: 0.000019
Epoch 0, Iteration 8070, Loss: 5.1242, Loss AR: 4.5482, Loss CFM: 0.5760, Grad Norm: 4.2387, LR: 0.000019
Epoch 0, Iteration 8080, Loss: 5.4241, Loss AR: 4.9028, Loss CFM: 0.5213, Grad Norm: 4.3645, LR: 0.000019
Epoch 0, Iteration 8090, Loss: 5.1514, Loss AR: 4.6149, Loss CFM: 0.5365, Grad Norm: 6.3928, LR: 0.000019
Epoch 0, Iteration 8100, Loss: 5.1052, Loss AR: 4.6582, Loss CFM: 0.4470, Grad Norm: 4.6396, LR: 0.000019
Epoch 0, Iteration 8110, Loss: 5.1993, Loss AR: 4.6471, Loss CFM: 0.5523, Grad Norm: 3.8076, LR: 0.000019
Epoch 0, Iteration 8120, Loss: 5.1183, Loss AR: 4.5256, Loss CFM: 0.5927, Grad Norm: 4.7997, LR: 0.000019
Epoch 0, Iteration 8130, Loss: 4.9045, Loss AR: 4.3614, Loss CFM: 0.5432, Grad Norm: 5.3004, LR: 0.000019
Epoch 0, Iteration 8140, Loss: 4.8548, Loss AR: 4.3491, Loss CFM: 0.5058, Grad Norm: 3.6650, LR: 0.000019
Epoch 0, Iteration 8150, Loss: 5.1432, Loss AR: 4.6377, Loss CFM: 0.5055, Grad Norm: 4.1134, LR: 0.000019
Epoch 0, Iteration 8160, Loss: 5.3245, Loss AR: 4.7385, Loss CFM: 0.5860, Grad Norm: 5.9359, LR: 0.000019
Epoch 0, Iteration 8170, Loss: 5.5555, Loss AR: 5.0458, Loss CFM: 0.5096, Grad Norm: 4.1274, LR: 0.000019
Epoch 0, Iteration 8180, Loss: 5.3819, Loss AR: 4.9099, Loss CFM: 0.4720, Grad Norm: 4.6188, LR: 0.000019
Epoch 0, Iteration 8190, Loss: 5.2198, Loss AR: 4.7139, Loss CFM: 0.5059, Grad Norm: 4.1565, LR: 0.000019
Epoch 0, Iteration 8200, Loss: 5.1872, Loss AR: 4.6441, Loss CFM: 0.5431, Grad Norm: 4.8066, LR: 0.000019
Epoch 0, Iteration 8210, Loss: 5.0097, Loss AR: 4.4234, Loss CFM: 0.5863, Grad Norm: 3.5856, LR: 0.000019
Epoch 0, Iteration 8220, Loss: 5.1224, Loss AR: 4.6145, Loss CFM: 0.5080, Grad Norm: 5.5929, LR: 0.000019
Epoch 0, Iteration 8230, Loss: 5.2144, Loss AR: 4.6402, Loss CFM: 0.5742, Grad Norm: 4.0247, LR: 0.000019
Epoch 0, Iteration 8240, Loss: 5.0740, Loss AR: 4.5213, Loss CFM: 0.5526, Grad Norm: 3.7141, LR: 0.000019
Epoch 0, Iteration 8250, Loss: 5.2618, Loss AR: 4.7365, Loss CFM: 0.5253, Grad Norm: 4.0222, LR: 0.000019
Epoch 0, Iteration 8260, Loss: 5.2101, Loss AR: 4.6449, Loss CFM: 0.5651, Grad Norm: 6.4962, LR: 0.000019
Epoch 0, Iteration 8270, Loss: 5.1325, Loss AR: 4.5882, Loss CFM: 0.5443, Grad Norm: 4.6104, LR: 0.000019
Epoch 0, Iteration 8280, Loss: 5.1357, Loss AR: 4.5608, Loss CFM: 0.5750, Grad Norm: 5.4482, LR: 0.000019
Epoch 0, Iteration 8290, Loss: 5.0287, Loss AR: 4.4388, Loss CFM: 0.5899, Grad Norm: 2.8710, LR: 0.000019
Epoch 0, Iteration 8300, Loss: 5.4804, Loss AR: 4.8692, Loss CFM: 0.6113, Grad Norm: 3.4951, LR: 0.000019
Epoch 0, Iteration 8310, Loss: 4.9992, Loss AR: 4.4560, Loss CFM: 0.5432, Grad Norm: 4.1166, LR: 0.000019
Epoch 0, Iteration 8320, Loss: 5.3070, Loss AR: 4.7183, Loss CFM: 0.5886, Grad Norm: 4.9231, LR: 0.000019
Epoch 0, Iteration 8330, Loss: 5.4656, Loss AR: 4.8889, Loss CFM: 0.5767, Grad Norm: 6.0635, LR: 0.000019
Epoch 0, Iteration 8340, Loss: 5.0419, Loss AR: 4.5165, Loss CFM: 0.5254, Grad Norm: 3.6618, LR: 0.000019
Epoch 0, Iteration 8350, Loss: 4.9657, Loss AR: 4.4162, Loss CFM: 0.5495, Grad Norm: 4.0540, LR: 0.000019
Epoch 0, Iteration 8360, Loss: 5.4823, Loss AR: 4.9157, Loss CFM: 0.5666, Grad Norm: 4.4835, LR: 0.000019
Epoch 0, Iteration 8370, Loss: 5.0507, Loss AR: 4.5294, Loss CFM: 0.5213, Grad Norm: 3.1323, LR: 0.000019
Epoch 0, Iteration 8380, Loss: 5.0921, Loss AR: 4.5525, Loss CFM: 0.5396, Grad Norm: 3.3535, LR: 0.000019
Epoch 0, Iteration 8390, Loss: 5.3914, Loss AR: 4.8348, Loss CFM: 0.5565, Grad Norm: 3.6830, LR: 0.000019
Epoch 0, Iteration 8400, Loss: 5.1545, Loss AR: 4.6171, Loss CFM: 0.5374, Grad Norm: 4.6319, LR: 0.000019
Epoch 0, Iteration 8410, Loss: 5.0103, Loss AR: 4.4677, Loss CFM: 0.5426, Grad Norm: 3.6851, LR: 0.000019
Epoch 0, Iteration 8420, Loss: 5.0642, Loss AR: 4.4340, Loss CFM: 0.6301, Grad Norm: 3.3363, LR: 0.000019
Epoch 0, Iteration 8430, Loss: 4.9872, Loss AR: 4.4312, Loss CFM: 0.5560, Grad Norm: 4.2758, LR: 0.000019
Epoch 0, Iteration 8440, Loss: 5.0340, Loss AR: 4.5321, Loss CFM: 0.5018, Grad Norm: 4.4318, LR: 0.000019
Epoch 0, Iteration 8450, Loss: 5.1921, Loss AR: 4.6346, Loss CFM: 0.5576, Grad Norm: 3.8461, LR: 0.000019
Epoch 0, Iteration 8460, Loss: 4.9803, Loss AR: 4.4172, Loss CFM: 0.5631, Grad Norm: 4.2244, LR: 0.000019
Epoch 0, Iteration 8470, Loss: 5.1554, Loss AR: 4.6243, Loss CFM: 0.5311, Grad Norm: 4.1419, LR: 0.000019
Epoch 0, Iteration 8480, Loss: 5.2941, Loss AR: 4.7132, Loss CFM: 0.5809, Grad Norm: 4.3743, LR: 0.000019
Epoch 0, Iteration 8490, Loss: 5.1479, Loss AR: 4.5878, Loss CFM: 0.5601, Grad Norm: 3.8086, LR: 0.000019
Epoch 0, Iteration 8500, Loss: 5.2544, Loss AR: 4.7194, Loss CFM: 0.5350, Grad Norm: 4.1684, LR: 0.000019
Epoch 0, Iteration 8510, Loss: 5.1043, Loss AR: 4.5854, Loss CFM: 0.5189, Grad Norm: 4.0190, LR: 0.000019
Epoch 0, Iteration 8520, Loss: 4.9503, Loss AR: 4.4523, Loss CFM: 0.4980, Grad Norm: 4.0585, LR: 0.000019
Epoch 0, Iteration 8530, Loss: 5.2016, Loss AR: 4.6443, Loss CFM: 0.5574, Grad Norm: 5.1224, LR: 0.000019
Epoch 0, Iteration 8540, Loss: 5.1459, Loss AR: 4.5342, Loss CFM: 0.6118, Grad Norm: 3.5641, LR: 0.000019
Epoch 0, Iteration 8550, Loss: 4.9825, Loss AR: 4.4402, Loss CFM: 0.5423, Grad Norm: 3.6233, LR: 0.000019
Epoch 0, Iteration 8560, Loss: 5.2211, Loss AR: 4.6452, Loss CFM: 0.5759, Grad Norm: 3.0680, LR: 0.000019
Epoch 0, Iteration 8570, Loss: 5.0941, Loss AR: 4.5631, Loss CFM: 0.5310, Grad Norm: 4.0015, LR: 0.000019
Epoch 0, Iteration 8580, Loss: 5.0692, Loss AR: 4.5080, Loss CFM: 0.5613, Grad Norm: 2.9832, LR: 0.000019
Epoch 0, Iteration 8590, Loss: 5.0961, Loss AR: 4.5602, Loss CFM: 0.5359, Grad Norm: 3.7249, LR: 0.000019
Epoch 0, Iteration 8600, Loss: 5.1227, Loss AR: 4.6238, Loss CFM: 0.4989, Grad Norm: 6.1718, LR: 0.000019
Epoch 0, Iteration 8610, Loss: 5.1107, Loss AR: 4.5678, Loss CFM: 0.5429, Grad Norm: 3.7641, LR: 0.000019
Epoch 0, Iteration 8620, Loss: 4.9275, Loss AR: 4.4124, Loss CFM: 0.5151, Grad Norm: 3.6870, LR: 0.000019
Epoch 0, Iteration 8630, Loss: 5.0989, Loss AR: 4.5410, Loss CFM: 0.5579, Grad Norm: 4.3269, LR: 0.000019
Epoch 0, Iteration 8640, Loss: 5.3797, Loss AR: 4.8478, Loss CFM: 0.5320, Grad Norm: 7.6298, LR: 0.000019
Epoch 0, Iteration 8650, Loss: 5.2566, Loss AR: 4.7187, Loss CFM: 0.5379, Grad Norm: 3.8609, LR: 0.000019
Epoch 0, Iteration 8660, Loss: 5.2123, Loss AR: 4.6425, Loss CFM: 0.5698, Grad Norm: 4.5042, LR: 0.000019
Epoch 0, Iteration 8670, Loss: 5.1170, Loss AR: 4.5921, Loss CFM: 0.5248, Grad Norm: 3.3898, LR: 0.000019
Epoch 0, Iteration 8680, Loss: 5.2944, Loss AR: 4.7475, Loss CFM: 0.5469, Grad Norm: 3.7086, LR: 0.000019
Epoch 0, Iteration 8690, Loss: 5.0825, Loss AR: 4.5571, Loss CFM: 0.5254, Grad Norm: 3.7120, LR: 0.000019
Epoch 0, Iteration 8700, Loss: 4.9833, Loss AR: 4.4706, Loss CFM: 0.5128, Grad Norm: 2.8068, LR: 0.000019
Epoch 0, Iteration 8710, Loss: 4.8771, Loss AR: 4.3726, Loss CFM: 0.5046, Grad Norm: 3.1196, LR: 0.000019
Epoch 0, Iteration 8720, Loss: 4.9992, Loss AR: 4.4087, Loss CFM: 0.5905, Grad Norm: 4.4218, LR: 0.000019
Epoch 0, Iteration 8730, Loss: 5.0599, Loss AR: 4.4168, Loss CFM: 0.6431, Grad Norm: 4.0383, LR: 0.000019
Epoch 0, Iteration 8740, Loss: 5.2318, Loss AR: 4.7507, Loss CFM: 0.4810, Grad Norm: 3.4744, LR: 0.000019
Epoch 0, Iteration 8750, Loss: 5.2748, Loss AR: 4.7492, Loss CFM: 0.5256, Grad Norm: 4.0690, LR: 0.000019
Epoch 0, Iteration 8760, Loss: 5.2067, Loss AR: 4.7181, Loss CFM: 0.4886, Grad Norm: 4.0355, LR: 0.000019
Epoch 0, Iteration 8770, Loss: 5.1947, Loss AR: 4.5469, Loss CFM: 0.6478, Grad Norm: 4.1184, LR: 0.000019
Epoch 0, Iteration 8780, Loss: 4.8427, Loss AR: 4.3101, Loss CFM: 0.5325, Grad Norm: 3.7911, LR: 0.000019
Epoch 0, Iteration 8790, Loss: 5.3620, Loss AR: 4.8543, Loss CFM: 0.5077, Grad Norm: 3.6426, LR: 0.000019
Epoch 0, Iteration 8800, Loss: 5.0030, Loss AR: 4.4571, Loss CFM: 0.5459, Grad Norm: 3.8912, LR: 0.000019
Epoch 0, Iteration 8810, Loss: 5.2141, Loss AR: 4.6538, Loss CFM: 0.5602, Grad Norm: 4.8663, LR: 0.000019
Epoch 0, Iteration 8820, Loss: 5.1964, Loss AR: 4.6479, Loss CFM: 0.5486, Grad Norm: 3.8640, LR: 0.000019
Epoch 0, Iteration 8830, Loss: 5.2264, Loss AR: 4.6721, Loss CFM: 0.5544, Grad Norm: 3.7033, LR: 0.000019
Epoch 0, Iteration 8840, Loss: 5.0648, Loss AR: 4.5459, Loss CFM: 0.5190, Grad Norm: 4.4491, LR: 0.000019
Epoch 0, Iteration 8850, Loss: 5.2806, Loss AR: 4.6820, Loss CFM: 0.5986, Grad Norm: 3.8086, LR: 0.000019
Epoch 0, Iteration 8860, Loss: 5.4980, Loss AR: 4.9633, Loss CFM: 0.5347, Grad Norm: 3.5825, LR: 0.000019
Epoch 0, Iteration 8870, Loss: 5.0595, Loss AR: 4.5059, Loss CFM: 0.5536, Grad Norm: 3.4863, LR: 0.000019
Epoch 0, Iteration 8880, Loss: 5.0730, Loss AR: 4.5435, Loss CFM: 0.5296, Grad Norm: 3.0450, LR: 0.000019
Epoch 0, Iteration 8890, Loss: 5.2469, Loss AR: 4.7016, Loss CFM: 0.5453, Grad Norm: 5.7317, LR: 0.000019
Epoch 0, Iteration 8900, Loss: 5.4471, Loss AR: 4.7240, Loss CFM: 0.7231, Grad Norm: 5.9444, LR: 0.000019
Epoch 0, Iteration 8910, Loss: 5.0013, Loss AR: 4.4366, Loss CFM: 0.5647, Grad Norm: 4.4130, LR: 0.000019
Epoch 0, Iteration 8920, Loss: 5.1830, Loss AR: 4.6990, Loss CFM: 0.4839, Grad Norm: 3.8879, LR: 0.000019
Epoch 0, Iteration 8930, Loss: 5.1167, Loss AR: 4.5898, Loss CFM: 0.5269, Grad Norm: 3.5874, LR: 0.000019
Epoch 0, Iteration 8940, Loss: 5.3067, Loss AR: 4.7100, Loss CFM: 0.5967, Grad Norm: 5.4542, LR: 0.000019
Epoch 0, Iteration 8950, Loss: 5.0926, Loss AR: 4.5390, Loss CFM: 0.5536, Grad Norm: 4.8554, LR: 0.000019
Epoch 0, Iteration 8960, Loss: 5.2778, Loss AR: 4.7081, Loss CFM: 0.5697, Grad Norm: 4.0539, LR: 0.000019
Epoch 0, Iteration 8970, Loss: 5.3481, Loss AR: 4.7476, Loss CFM: 0.6005, Grad Norm: 4.5955, LR: 0.000019
Epoch 0, Iteration 8980, Loss: 5.0250, Loss AR: 4.4812, Loss CFM: 0.5438, Grad Norm: 4.2899, LR: 0.000019
Epoch 0, Iteration 8990, Loss: 4.8163, Loss AR: 4.3426, Loss CFM: 0.4738, Grad Norm: 4.0851, LR: 0.000019
Epoch 0, Iteration 9000, Loss: 5.1026, Loss AR: 4.5218, Loss CFM: 0.5808, Grad Norm: 3.3052, LR: 0.000019
Epoch 0, Iteration 9010, Loss: 5.2021, Loss AR: 4.6753, Loss CFM: 0.5269, Grad Norm: 4.1425, LR: 0.000019
Epoch 0, Iteration 9020, Loss: 5.2325, Loss AR: 4.6430, Loss CFM: 0.5895, Grad Norm: 4.6429, LR: 0.000019
Epoch 0, Iteration 9030, Loss: 5.2400, Loss AR: 4.7016, Loss CFM: 0.5385, Grad Norm: 4.5424, LR: 0.000019
Epoch 0, Iteration 9040, Loss: 5.2433, Loss AR: 4.6798, Loss CFM: 0.5635, Grad Norm: 3.9805, LR: 0.000019
Epoch 0, Iteration 9050, Loss: 5.0172, Loss AR: 4.4604, Loss CFM: 0.5568, Grad Norm: 5.2644, LR: 0.000019
Epoch 0, Iteration 9060, Loss: 5.2907, Loss AR: 4.7271, Loss CFM: 0.5636, Grad Norm: 3.7494, LR: 0.000019
Epoch 0, Iteration 9070, Loss: 4.9276, Loss AR: 4.4238, Loss CFM: 0.5038, Grad Norm: 3.4954, LR: 0.000019
Epoch 0, Iteration 9080, Loss: 5.0604, Loss AR: 4.4988, Loss CFM: 0.5616, Grad Norm: 5.5392, LR: 0.000019
Epoch 0, Iteration 9090, Loss: 5.0727, Loss AR: 4.5638, Loss CFM: 0.5089, Grad Norm: 4.2462, LR: 0.000019
Epoch 0, Iteration 9100, Loss: 4.9267, Loss AR: 4.4012, Loss CFM: 0.5255, Grad Norm: 3.2053, LR: 0.000019
Epoch 0, Iteration 9110, Loss: 5.0888, Loss AR: 4.4573, Loss CFM: 0.6315, Grad Norm: 4.4415, LR: 0.000019
Epoch 0, Iteration 9120, Loss: 4.9572, Loss AR: 4.4214, Loss CFM: 0.5359, Grad Norm: 3.6981, LR: 0.000019
Epoch 0, Iteration 9130, Loss: 5.4042, Loss AR: 4.8020, Loss CFM: 0.6023, Grad Norm: 3.9707, LR: 0.000019
Epoch 0, Iteration 9140, Loss: 4.8780, Loss AR: 4.2514, Loss CFM: 0.6266, Grad Norm: 4.5228, LR: 0.000019
Epoch 0, Iteration 9150, Loss: 4.9227, Loss AR: 4.3786, Loss CFM: 0.5440, Grad Norm: 4.4857, LR: 0.000019
Epoch 0, Iteration 9160, Loss: 5.2681, Loss AR: 4.7898, Loss CFM: 0.4783, Grad Norm: 3.7734, LR: 0.000019
Epoch 0, Iteration 9170, Loss: 5.1573, Loss AR: 4.5809, Loss CFM: 0.5763, Grad Norm: 2.9715, LR: 0.000019
Epoch 0, Iteration 9180, Loss: 4.9867, Loss AR: 4.4124, Loss CFM: 0.5744, Grad Norm: 4.2359, LR: 0.000019
Epoch 0, Iteration 9190, Loss: 5.0301, Loss AR: 4.4260, Loss CFM: 0.6040, Grad Norm: 3.8801, LR: 0.000019
Epoch 0, Iteration 9200, Loss: 5.2291, Loss AR: 4.6989, Loss CFM: 0.5302, Grad Norm: 4.6892, LR: 0.000019
Epoch 0, Iteration 9210, Loss: 5.0671, Loss AR: 4.5810, Loss CFM: 0.4861, Grad Norm: 4.6936, LR: 0.000019
Epoch 0, Iteration 9220, Loss: 5.2832, Loss AR: 4.7617, Loss CFM: 0.5215, Grad Norm: 5.2182, LR: 0.000019
Epoch 0, Iteration 9230, Loss: 5.0814, Loss AR: 4.5595, Loss CFM: 0.5220, Grad Norm: 4.6670, LR: 0.000019
Epoch 0, Iteration 9240, Loss: 4.9151, Loss AR: 4.4157, Loss CFM: 0.4994, Grad Norm: 3.6036, LR: 0.000019
Epoch 0, Iteration 9250, Loss: 5.0969, Loss AR: 4.5623, Loss CFM: 0.5345, Grad Norm: 5.9484, LR: 0.000019
Epoch 0, Iteration 9260, Loss: 4.9895, Loss AR: 4.4230, Loss CFM: 0.5665, Grad Norm: 4.6604, LR: 0.000019
Epoch 0, Iteration 9270, Loss: 4.8745, Loss AR: 4.2983, Loss CFM: 0.5761, Grad Norm: 3.4837, LR: 0.000019
Epoch 0, Iteration 9280, Loss: 5.3619, Loss AR: 4.7803, Loss CFM: 0.5817, Grad Norm: 6.1104, LR: 0.000019
Epoch 0, Iteration 9290, Loss: 5.1876, Loss AR: 4.6685, Loss CFM: 0.5191, Grad Norm: 3.8230, LR: 0.000019
Epoch 0, Iteration 9300, Loss: 4.8575, Loss AR: 4.3184, Loss CFM: 0.5391, Grad Norm: 3.0600, LR: 0.000019
Epoch 0, Iteration 9310, Loss: 5.0195, Loss AR: 4.4478, Loss CFM: 0.5718, Grad Norm: 4.9438, LR: 0.000019
Epoch 0, Iteration 9320, Loss: 5.1433, Loss AR: 4.6308, Loss CFM: 0.5125, Grad Norm: 5.0642, LR: 0.000019
Epoch 0, Iteration 9330, Loss: 5.2473, Loss AR: 4.6652, Loss CFM: 0.5821, Grad Norm: 6.2234, LR: 0.000019
Epoch 0, Iteration 9340, Loss: 5.2868, Loss AR: 4.7262, Loss CFM: 0.5606, Grad Norm: 3.5903, LR: 0.000019
Epoch 0, Iteration 9350, Loss: 5.0438, Loss AR: 4.5212, Loss CFM: 0.5225, Grad Norm: 3.8287, LR: 0.000019
Epoch 0, Iteration 9360, Loss: 5.0329, Loss AR: 4.4952, Loss CFM: 0.5377, Grad Norm: 4.5033, LR: 0.000019
Epoch 0, Iteration 9370, Loss: 5.2163, Loss AR: 4.6224, Loss CFM: 0.5939, Grad Norm: 6.0828, LR: 0.000019
Epoch 0, Iteration 9380, Loss: 5.0780, Loss AR: 4.4886, Loss CFM: 0.5894, Grad Norm: 3.2801, LR: 0.000019
Epoch 0, Iteration 9390, Loss: 5.1168, Loss AR: 4.5282, Loss CFM: 0.5887, Grad Norm: 3.8551, LR: 0.000019
Epoch 0, Iteration 9400, Loss: 5.3243, Loss AR: 4.7081, Loss CFM: 0.6162, Grad Norm: 3.6619, LR: 0.000019
Epoch 0, Iteration 9410, Loss: 4.9536, Loss AR: 4.4274, Loss CFM: 0.5263, Grad Norm: 3.9153, LR: 0.000019
Epoch 0, Iteration 9420, Loss: 5.1434, Loss AR: 4.6234, Loss CFM: 0.5199, Grad Norm: 3.1893, LR: 0.000019
Epoch 0, Iteration 9430, Loss: 5.2341, Loss AR: 4.7199, Loss CFM: 0.5142, Grad Norm: 4.1828, LR: 0.000019
Epoch 0, Iteration 9440, Loss: 4.9035, Loss AR: 4.3154, Loss CFM: 0.5881, Grad Norm: 4.9150, LR: 0.000019
Epoch 0, Iteration 9450, Loss: 5.3144, Loss AR: 4.6942, Loss CFM: 0.6202, Grad Norm: 4.5790, LR: 0.000019
Epoch 0, Iteration 9460, Loss: 4.9949, Loss AR: 4.4608, Loss CFM: 0.5341, Grad Norm: 4.2382, LR: 0.000019
Epoch 0, Iteration 9470, Loss: 5.0181, Loss AR: 4.4938, Loss CFM: 0.5243, Grad Norm: 4.4505, LR: 0.000019
Epoch 0, Iteration 9480, Loss: 5.2495, Loss AR: 4.7322, Loss CFM: 0.5173, Grad Norm: 4.0771, LR: 0.000019
Epoch 0, Iteration 9490, Loss: 5.1193, Loss AR: 4.6055, Loss CFM: 0.5138, Grad Norm: 5.0158, LR: 0.000019
Epoch 0, Iteration 9500, Loss: 5.2028, Loss AR: 4.6505, Loss CFM: 0.5523, Grad Norm: 4.0201, LR: 0.000019
Epoch 0, Iteration 9510, Loss: 5.1763, Loss AR: 4.6163, Loss CFM: 0.5600, Grad Norm: 4.2800, LR: 0.000019
Epoch 0, Iteration 9520, Loss: 5.2990, Loss AR: 4.8019, Loss CFM: 0.4970, Grad Norm: 3.4066, LR: 0.000019
Epoch 0, Iteration 9530, Loss: 4.9475, Loss AR: 4.4120, Loss CFM: 0.5355, Grad Norm: 4.9376, LR: 0.000019
Epoch 0, Iteration 9540, Loss: 5.4257, Loss AR: 4.9713, Loss CFM: 0.4544, Grad Norm: 5.5907, LR: 0.000019
Epoch 0, Iteration 9550, Loss: 5.1620, Loss AR: 4.6027, Loss CFM: 0.5593, Grad Norm: 5.2068, LR: 0.000019
Epoch 0, Iteration 9560, Loss: 5.1963, Loss AR: 4.6822, Loss CFM: 0.5141, Grad Norm: 4.3019, LR: 0.000019
Epoch 0, Iteration 9570, Loss: 5.2671, Loss AR: 4.6546, Loss CFM: 0.6125, Grad Norm: 4.5987, LR: 0.000019
Epoch 0, Iteration 9580, Loss: 5.1466, Loss AR: 4.5562, Loss CFM: 0.5905, Grad Norm: 4.9397, LR: 0.000019
Epoch 0, Iteration 9590, Loss: 5.2730, Loss AR: 4.7145, Loss CFM: 0.5585, Grad Norm: 3.9415, LR: 0.000019
Epoch 0, Iteration 9600, Loss: 5.0698, Loss AR: 4.5577, Loss CFM: 0.5121, Grad Norm: 4.2057, LR: 0.000019
Epoch 0, Iteration 9610, Loss: 5.0994, Loss AR: 4.6064, Loss CFM: 0.4931, Grad Norm: 4.4601, LR: 0.000019
Epoch 0, Iteration 9620, Loss: 4.8899, Loss AR: 4.3535, Loss CFM: 0.5364, Grad Norm: 4.1597, LR: 0.000019
Epoch 0, Iteration 9630, Loss: 5.0474, Loss AR: 4.4556, Loss CFM: 0.5918, Grad Norm: 2.9449, LR: 0.000019
Epoch 0, Iteration 9640, Loss: 5.1244, Loss AR: 4.5050, Loss CFM: 0.6194, Grad Norm: 5.5765, LR: 0.000019
Epoch 0, Iteration 9650, Loss: 5.1532, Loss AR: 4.6233, Loss CFM: 0.5299, Grad Norm: 3.8447, LR: 0.000019
Epoch 0, Iteration 9660, Loss: 5.0355, Loss AR: 4.5050, Loss CFM: 0.5305, Grad Norm: 3.3963, LR: 0.000019
Epoch 0, Iteration 9670, Loss: 5.1158, Loss AR: 4.5131, Loss CFM: 0.6027, Grad Norm: 4.7885, LR: 0.000019
Epoch 0, Iteration 9680, Loss: 4.9715, Loss AR: 4.4566, Loss CFM: 0.5149, Grad Norm: 4.5127, LR: 0.000019
Epoch 0, Iteration 9690, Loss: 5.2597, Loss AR: 4.7106, Loss CFM: 0.5491, Grad Norm: 4.1001, LR: 0.000019
Epoch 0, Iteration 9700, Loss: 4.9970, Loss AR: 4.4150, Loss CFM: 0.5820, Grad Norm: 5.1714, LR: 0.000019
Epoch 0, Iteration 9710, Loss: 5.0921, Loss AR: 4.5620, Loss CFM: 0.5301, Grad Norm: 4.2887, LR: 0.000019
Epoch 0, Iteration 9720, Loss: 5.0497, Loss AR: 4.4620, Loss CFM: 0.5877, Grad Norm: 2.8584, LR: 0.000019
Epoch 0, Iteration 9730, Loss: 5.0353, Loss AR: 4.4511, Loss CFM: 0.5842, Grad Norm: 4.6989, LR: 0.000019
Epoch 0, Iteration 9740, Loss: 5.1493, Loss AR: 4.5958, Loss CFM: 0.5535, Grad Norm: 3.4001, LR: 0.000019
Epoch 0, Iteration 9750, Loss: 4.9995, Loss AR: 4.5370, Loss CFM: 0.4625, Grad Norm: 4.9046, LR: 0.000019
Epoch 0, Iteration 9760, Loss: 5.3923, Loss AR: 4.8298, Loss CFM: 0.5626, Grad Norm: 4.2294, LR: 0.000019
Epoch 0, Iteration 9770, Loss: 5.0977, Loss AR: 4.5315, Loss CFM: 0.5662, Grad Norm: 5.4464, LR: 0.000019
Epoch 0, Iteration 9780, Loss: 5.1051, Loss AR: 4.5253, Loss CFM: 0.5798, Grad Norm: 4.7152, LR: 0.000019
Epoch 0, Iteration 9790, Loss: 5.1238, Loss AR: 4.5861, Loss CFM: 0.5377, Grad Norm: 3.4983, LR: 0.000019
Epoch 0, Iteration 9800, Loss: 4.8312, Loss AR: 4.3192, Loss CFM: 0.5120, Grad Norm: 4.3716, LR: 0.000019
Epoch 0, Iteration 9810, Loss: 5.0496, Loss AR: 4.5483, Loss CFM: 0.5013, Grad Norm: 3.6779, LR: 0.000019
Epoch 0, Iteration 9820, Loss: 5.2004, Loss AR: 4.6503, Loss CFM: 0.5501, Grad Norm: 4.5903, LR: 0.000019
Epoch 0, Iteration 9830, Loss: 4.9767, Loss AR: 4.4366, Loss CFM: 0.5401, Grad Norm: 4.7515, LR: 0.000019
Epoch 0, Iteration 9840, Loss: 5.1270, Loss AR: 4.6274, Loss CFM: 0.4997, Grad Norm: 3.3361, LR: 0.000019
Epoch 0, Iteration 9850, Loss: 5.1854, Loss AR: 4.6422, Loss CFM: 0.5432, Grad Norm: 4.2699, LR: 0.000019
Epoch 0, Iteration 9860, Loss: 5.1563, Loss AR: 4.5116, Loss CFM: 0.6447, Grad Norm: 3.8856, LR: 0.000019
Epoch 0, Iteration 9870, Loss: 5.0107, Loss AR: 4.5239, Loss CFM: 0.4868, Grad Norm: 5.6834, LR: 0.000019
Epoch 0, Iteration 9880, Loss: 5.0300, Loss AR: 4.4761, Loss CFM: 0.5540, Grad Norm: 3.2291, LR: 0.000019
Epoch 0, Iteration 9890, Loss: 5.3280, Loss AR: 4.3467, Loss CFM: 0.9813, Grad Norm: 4.2232, LR: 0.000019
Epoch 0, Iteration 9900, Loss: 4.9683, Loss AR: 4.4673, Loss CFM: 0.5010, Grad Norm: 4.6058, LR: 0.000019
Epoch 0, Iteration 9910, Loss: 5.1624, Loss AR: 4.6354, Loss CFM: 0.5270, Grad Norm: 4.0625, LR: 0.000019
Epoch 0, Iteration 9920, Loss: 5.0466, Loss AR: 4.4735, Loss CFM: 0.5732, Grad Norm: 4.7137, LR: 0.000019
Epoch 0, Iteration 9930, Loss: 4.9199, Loss AR: 4.3761, Loss CFM: 0.5437, Grad Norm: 3.0034, LR: 0.000019
Epoch 0, Iteration 9940, Loss: 4.9337, Loss AR: 4.3593, Loss CFM: 0.5744, Grad Norm: 3.7182, LR: 0.000019
Epoch 0, Iteration 9950, Loss: 5.1611, Loss AR: 4.6175, Loss CFM: 0.5437, Grad Norm: 4.4718, LR: 0.000019
Epoch 0, Iteration 9960, Loss: 5.1316, Loss AR: 4.6471, Loss CFM: 0.4844, Grad Norm: 4.0198, LR: 0.000019
Epoch 0, Iteration 9970, Loss: 4.9117, Loss AR: 4.4131, Loss CFM: 0.4986, Grad Norm: 3.0879, LR: 0.000019
Epoch 0, Iteration 9980, Loss: 5.3387, Loss AR: 4.7935, Loss CFM: 0.5452, Grad Norm: 3.8865, LR: 0.000019
Epoch 0, Iteration 9990, Loss: 5.2212, Loss AR: 4.6677, Loss CFM: 0.5536, Grad Norm: 3.8165, LR: 0.000019
Epoch 0, Iteration 10000, Loss: 4.9664, Loss AR: 4.4062, Loss CFM: 0.5602, Grad Norm: 4.9993, LR: 0.000019
Epoch 0, Iteration 10010, Loss: 5.3268, Loss AR: 4.7173, Loss CFM: 0.6095, Grad Norm: 5.3263, LR: 0.000019
Epoch 0, Iteration 10020, Loss: 5.0990, Loss AR: 4.6574, Loss CFM: 0.4416, Grad Norm: 3.6618, LR: 0.000019
Epoch 0, Iteration 10030, Loss: 5.0635, Loss AR: 4.5636, Loss CFM: 0.4999, Grad Norm: 4.3102, LR: 0.000019
Epoch 0, Iteration 10040, Loss: 5.1115, Loss AR: 4.5796, Loss CFM: 0.5318, Grad Norm: 4.1808, LR: 0.000019
Epoch 0, Iteration 10050, Loss: 5.3063, Loss AR: 4.7845, Loss CFM: 0.5218, Grad Norm: 3.4524, LR: 0.000019
Epoch 0, Iteration 10060, Loss: 4.9136, Loss AR: 4.3445, Loss CFM: 0.5691, Grad Norm: 3.1816, LR: 0.000019
Epoch 0, Iteration 10070, Loss: 5.2785, Loss AR: 4.7197, Loss CFM: 0.5588, Grad Norm: 3.8235, LR: 0.000019
Epoch 0, Iteration 10080, Loss: 5.0416, Loss AR: 4.4877, Loss CFM: 0.5539, Grad Norm: 5.0721, LR: 0.000019
Epoch 0, Iteration 10090, Loss: 5.2639, Loss AR: 4.6846, Loss CFM: 0.5793, Grad Norm: 3.7567, LR: 0.000019
Epoch 0, Iteration 10100, Loss: 5.0410, Loss AR: 4.5425, Loss CFM: 0.4985, Grad Norm: 4.3287, LR: 0.000019
Epoch 0, Iteration 10110, Loss: 4.9793, Loss AR: 4.4832, Loss CFM: 0.4961, Grad Norm: 3.2776, LR: 0.000019
Epoch 0, Iteration 10120, Loss: 5.1069, Loss AR: 4.5426, Loss CFM: 0.5643, Grad Norm: 3.8592, LR: 0.000019
Epoch 0, Iteration 10130, Loss: 5.1527, Loss AR: 4.4915, Loss CFM: 0.6612, Grad Norm: 3.7976, LR: 0.000019
Epoch 0, Iteration 10140, Loss: 5.0316, Loss AR: 4.4960, Loss CFM: 0.5356, Grad Norm: 3.1860, LR: 0.000019
Epoch 0, Iteration 10150, Loss: 4.8919, Loss AR: 4.3386, Loss CFM: 0.5533, Grad Norm: 3.7466, LR: 0.000019
Epoch 0, Iteration 10160, Loss: 5.0200, Loss AR: 4.4877, Loss CFM: 0.5323, Grad Norm: 4.7306, LR: 0.000019
Epoch 0, Iteration 10170, Loss: 5.1797, Loss AR: 4.6355, Loss CFM: 0.5442, Grad Norm: 4.6633, LR: 0.000019
Epoch 0, Iteration 10180, Loss: 4.9487, Loss AR: 4.3933, Loss CFM: 0.5554, Grad Norm: 3.4278, LR: 0.000019
Epoch 0, Iteration 10190, Loss: 5.0482, Loss AR: 4.5005, Loss CFM: 0.5477, Grad Norm: 3.8575, LR: 0.000019
Epoch 0, Iteration 10200, Loss: 4.9076, Loss AR: 4.3970, Loss CFM: 0.5105, Grad Norm: 3.7609, LR: 0.000019
Epoch 0, Iteration 10210, Loss: 5.0915, Loss AR: 4.5554, Loss CFM: 0.5361, Grad Norm: 4.0374, LR: 0.000019
Epoch 0, Iteration 10220, Loss: 5.0575, Loss AR: 4.4742, Loss CFM: 0.5833, Grad Norm: 3.4130, LR: 0.000019
Epoch 0, Iteration 10230, Loss: 5.1787, Loss AR: 4.5910, Loss CFM: 0.5877, Grad Norm: 3.4952, LR: 0.000019
Epoch 0, Iteration 10240, Loss: 5.2469, Loss AR: 4.6596, Loss CFM: 0.5874, Grad Norm: 3.8879, LR: 0.000019
Epoch 0, Iteration 10250, Loss: 5.1349, Loss AR: 4.5628, Loss CFM: 0.5721, Grad Norm: 4.9821, LR: 0.000019
Epoch 0, Iteration 10260, Loss: 5.2667, Loss AR: 4.6752, Loss CFM: 0.5915, Grad Norm: 4.5941, LR: 0.000019
Epoch 0, Iteration 10270, Loss: 4.7875, Loss AR: 4.2982, Loss CFM: 0.4893, Grad Norm: 3.6938, LR: 0.000019
Epoch 0, Iteration 10280, Loss: 5.1751, Loss AR: 4.5385, Loss CFM: 0.6366, Grad Norm: 3.2527, LR: 0.000019
Epoch 0, Iteration 10290, Loss: 5.0709, Loss AR: 4.6255, Loss CFM: 0.4454, Grad Norm: 5.6262, LR: 0.000019
Epoch 0, Iteration 10300, Loss: 4.9204, Loss AR: 4.3697, Loss CFM: 0.5507, Grad Norm: 5.1498, LR: 0.000019
Epoch 0, Iteration 10310, Loss: 5.0348, Loss AR: 4.5376, Loss CFM: 0.4972, Grad Norm: 4.2863, LR: 0.000019
Epoch 0, Iteration 10320, Loss: 5.2340, Loss AR: 4.7003, Loss CFM: 0.5337, Grad Norm: 4.4829, LR: 0.000019
Epoch 0, Iteration 10330, Loss: 5.1050, Loss AR: 4.5200, Loss CFM: 0.5851, Grad Norm: 5.9837, LR: 0.000019
Epoch 0, Iteration 10340, Loss: 5.1625, Loss AR: 4.6489, Loss CFM: 0.5136, Grad Norm: 5.1073, LR: 0.000019
Epoch 0, Iteration 10350, Loss: 5.0350, Loss AR: 4.4444, Loss CFM: 0.5906, Grad Norm: 2.8420, LR: 0.000019
Epoch 0, Iteration 10360, Loss: 5.5076, Loss AR: 4.9676, Loss CFM: 0.5399, Grad Norm: 5.8206, LR: 0.000019
Epoch 0, Iteration 10370, Loss: 5.0597, Loss AR: 4.5711, Loss CFM: 0.4886, Grad Norm: 4.1574, LR: 0.000019
Epoch 0, Iteration 10380, Loss: 5.2570, Loss AR: 4.6852, Loss CFM: 0.5719, Grad Norm: 4.1859, LR: 0.000019
Epoch 0, Iteration 10390, Loss: 5.2458, Loss AR: 4.6601, Loss CFM: 0.5856, Grad Norm: 8.0395, LR: 0.000019
Epoch 0, Iteration 10400, Loss: 5.2544, Loss AR: 4.6310, Loss CFM: 0.6234, Grad Norm: 3.5698, LR: 0.000019
Epoch 0, Iteration 10410, Loss: 5.0814, Loss AR: 4.5330, Loss CFM: 0.5484, Grad Norm: 3.4296, LR: 0.000019
Epoch 0, Iteration 10420, Loss: 5.0650, Loss AR: 4.5523, Loss CFM: 0.5127, Grad Norm: 5.0735, LR: 0.000019
Epoch 0, Iteration 10430, Loss: 4.9354, Loss AR: 4.4245, Loss CFM: 0.5109, Grad Norm: 4.4959, LR: 0.000019
Epoch 0, Iteration 10440, Loss: 5.2162, Loss AR: 4.7251, Loss CFM: 0.4911, Grad Norm: 3.7938, LR: 0.000019
Epoch 0, Iteration 10450, Loss: 5.1040, Loss AR: 4.4905, Loss CFM: 0.6134, Grad Norm: 5.4288, LR: 0.000019
Epoch 0, Iteration 10460, Loss: 5.1586, Loss AR: 4.5352, Loss CFM: 0.6234, Grad Norm: 7.7918, LR: 0.000019
Epoch 0, Iteration 10470, Loss: 5.1349, Loss AR: 4.5429, Loss CFM: 0.5920, Grad Norm: 4.3352, LR: 0.000019
Epoch 0, Iteration 10480, Loss: 4.9135, Loss AR: 4.3664, Loss CFM: 0.5471, Grad Norm: 4.2281, LR: 0.000019
Epoch 0, Iteration 10490, Loss: 4.9861, Loss AR: 4.4234, Loss CFM: 0.5627, Grad Norm: 3.3823, LR: 0.000019
Epoch 0, Iteration 10500, Loss: 4.9622, Loss AR: 4.4249, Loss CFM: 0.5372, Grad Norm: 4.7497, LR: 0.000019
Epoch 0, Iteration 10510, Loss: 4.9874, Loss AR: 4.4280, Loss CFM: 0.5594, Grad Norm: 3.9003, LR: 0.000019
Epoch 0, Iteration 10520, Loss: 5.0160, Loss AR: 4.5150, Loss CFM: 0.5010, Grad Norm: 4.4323, LR: 0.000019
Epoch 0, Iteration 10530, Loss: 5.2385, Loss AR: 4.6823, Loss CFM: 0.5562, Grad Norm: 4.4660, LR: 0.000019
Epoch 0, Iteration 10540, Loss: 5.2214, Loss AR: 4.6419, Loss CFM: 0.5795, Grad Norm: 4.2492, LR: 0.000019
Epoch 0, Iteration 10550, Loss: 5.0374, Loss AR: 4.4760, Loss CFM: 0.5614, Grad Norm: 5.6181, LR: 0.000019
Epoch 0, Iteration 10560, Loss: 5.0511, Loss AR: 4.5157, Loss CFM: 0.5354, Grad Norm: 4.2187, LR: 0.000019
Epoch 0, Iteration 10570, Loss: 5.1901, Loss AR: 4.6131, Loss CFM: 0.5770, Grad Norm: 5.8986, LR: 0.000019
Epoch 0, Iteration 10580, Loss: 5.2016, Loss AR: 4.6358, Loss CFM: 0.5658, Grad Norm: 3.9928, LR: 0.000019
Epoch 0, Iteration 10590, Loss: 4.9798, Loss AR: 4.4236, Loss CFM: 0.5563, Grad Norm: 3.5554, LR: 0.000019
Epoch 0, Iteration 10600, Loss: 4.9048, Loss AR: 4.2758, Loss CFM: 0.6290, Grad Norm: 4.0779, LR: 0.000019
Epoch 0, Iteration 10610, Loss: 5.2951, Loss AR: 4.7348, Loss CFM: 0.5603, Grad Norm: 4.4714, LR: 0.000019
Epoch 0, Iteration 10620, Loss: 4.9915, Loss AR: 4.4767, Loss CFM: 0.5148, Grad Norm: 3.9133, LR: 0.000019
Epoch 0, Iteration 10630, Loss: 5.2065, Loss AR: 4.6409, Loss CFM: 0.5656, Grad Norm: 3.9325, LR: 0.000019
Epoch 0, Iteration 10640, Loss: 5.0277, Loss AR: 4.4689, Loss CFM: 0.5588, Grad Norm: 4.1493, LR: 0.000019
Epoch 0, Iteration 10650, Loss: 5.1288, Loss AR: 4.5300, Loss CFM: 0.5988, Grad Norm: 4.5187, LR: 0.000019
Epoch 0, Iteration 10660, Loss: 5.1391, Loss AR: 4.6397, Loss CFM: 0.4994, Grad Norm: 4.1708, LR: 0.000019
Epoch 0, Iteration 10670, Loss: 5.1353, Loss AR: 4.6117, Loss CFM: 0.5235, Grad Norm: 3.2782, LR: 0.000019
Epoch 0, Iteration 10680, Loss: 5.0854, Loss AR: 4.4256, Loss CFM: 0.6598, Grad Norm: 5.8631, LR: 0.000019
Epoch 0, Iteration 10690, Loss: 5.3030, Loss AR: 4.7520, Loss CFM: 0.5510, Grad Norm: 3.5833, LR: 0.000019
Epoch 0, Iteration 10700, Loss: 4.9568, Loss AR: 4.4526, Loss CFM: 0.5042, Grad Norm: 4.9215, LR: 0.000019
Epoch 0, Iteration 10710, Loss: 5.1068, Loss AR: 4.5465, Loss CFM: 0.5603, Grad Norm: 3.8301, LR: 0.000019
Epoch 0, Iteration 10720, Loss: 5.1172, Loss AR: 4.5573, Loss CFM: 0.5598, Grad Norm: 4.4904, LR: 0.000019
Epoch 0, Iteration 10730, Loss: 5.1906, Loss AR: 4.6744, Loss CFM: 0.5162, Grad Norm: 4.0572, LR: 0.000019
Epoch 0, Iteration 10740, Loss: 5.1207, Loss AR: 4.4877, Loss CFM: 0.6329, Grad Norm: 4.7884, LR: 0.000019
Epoch 0, Iteration 10750, Loss: 4.9390, Loss AR: 4.3618, Loss CFM: 0.5772, Grad Norm: 3.5917, LR: 0.000019
Epoch 0, Iteration 10760, Loss: 5.0022, Loss AR: 4.4433, Loss CFM: 0.5588, Grad Norm: 3.6366, LR: 0.000019
Epoch 0, Iteration 10770, Loss: 5.3774, Loss AR: 4.7839, Loss CFM: 0.5935, Grad Norm: 4.9069, LR: 0.000019
Epoch 0, Iteration 10780, Loss: 5.3343, Loss AR: 4.7506, Loss CFM: 0.5838, Grad Norm: 4.0711, LR: 0.000019
Epoch 0, Iteration 10790, Loss: 4.9661, Loss AR: 4.5183, Loss CFM: 0.4478, Grad Norm: 5.5807, LR: 0.000019
Epoch 0, Iteration 10800, Loss: 5.2289, Loss AR: 4.6217, Loss CFM: 0.6072, Grad Norm: 4.5014, LR: 0.000019
Epoch 0, Iteration 10810, Loss: 4.9721, Loss AR: 4.3785, Loss CFM: 0.5936, Grad Norm: 4.3949, LR: 0.000019
Epoch 0, Iteration 10820, Loss: 5.6870, Loss AR: 4.8365, Loss CFM: 0.8506, Grad Norm: 5.9274, LR: 0.000019
Epoch 0, Iteration 10830, Loss: 5.1738, Loss AR: 4.5700, Loss CFM: 0.6038, Grad Norm: 3.7888, LR: 0.000019
Epoch 0, Iteration 10840, Loss: 4.8901, Loss AR: 4.4025, Loss CFM: 0.4875, Grad Norm: 3.6007, LR: 0.000019
Epoch 0, Iteration 10850, Loss: 5.1966, Loss AR: 4.7032, Loss CFM: 0.4935, Grad Norm: 3.9687, LR: 0.000019
Epoch 0, Iteration 10860, Loss: 4.9810, Loss AR: 4.4889, Loss CFM: 0.4921, Grad Norm: 3.7769, LR: 0.000019
Epoch 0, Iteration 10870, Loss: 5.3418, Loss AR: 4.8033, Loss CFM: 0.5385, Grad Norm: 4.2038, LR: 0.000019
Epoch 0, Iteration 10880, Loss: 4.9643, Loss AR: 4.3581, Loss CFM: 0.6063, Grad Norm: 4.9263, LR: 0.000019
Epoch 0, Iteration 10890, Loss: 5.1738, Loss AR: 4.6586, Loss CFM: 0.5152, Grad Norm: 4.9628, LR: 0.000019
Epoch 0, Iteration 10900, Loss: 5.1601, Loss AR: 4.5863, Loss CFM: 0.5738, Grad Norm: 3.9926, LR: 0.000019
Epoch 0, Iteration 10910, Loss: 4.9881, Loss AR: 4.4808, Loss CFM: 0.5073, Grad Norm: 4.0144, LR: 0.000019
Epoch 0, Iteration 10920, Loss: 4.8050, Loss AR: 4.2732, Loss CFM: 0.5318, Grad Norm: 3.9731, LR: 0.000019
Epoch 0, Iteration 10930, Loss: 5.0057, Loss AR: 4.4417, Loss CFM: 0.5640, Grad Norm: 3.9460, LR: 0.000019
Epoch 0, Iteration 10940, Loss: 5.1891, Loss AR: 4.6507, Loss CFM: 0.5384, Grad Norm: 3.7042, LR: 0.000019
Epoch 0, Iteration 10950, Loss: 4.9901, Loss AR: 4.4341, Loss CFM: 0.5560, Grad Norm: 4.8265, LR: 0.000019
Epoch 0, Iteration 10960, Loss: 5.3089, Loss AR: 4.7362, Loss CFM: 0.5727, Grad Norm: 6.3676, LR: 0.000019
Epoch 0, Iteration 10970, Loss: 5.0684, Loss AR: 4.5141, Loss CFM: 0.5543, Grad Norm: 4.2840, LR: 0.000019
Epoch 0, Iteration 10980, Loss: 5.3217, Loss AR: 4.7122, Loss CFM: 0.6095, Grad Norm: 6.5379, LR: 0.000019
Epoch 0, Iteration 10990, Loss: 5.2464, Loss AR: 4.6503, Loss CFM: 0.5961, Grad Norm: 7.6947, LR: 0.000019
Epoch 0, Iteration 11000, Loss: 5.1223, Loss AR: 4.5825, Loss CFM: 0.5398, Grad Norm: 3.5645, LR: 0.000019
Epoch 0, Iteration 11010, Loss: 5.2472, Loss AR: 4.6597, Loss CFM: 0.5875, Grad Norm: 5.0541, LR: 0.000019
Epoch 0, Iteration 11020, Loss: 5.1564, Loss AR: 4.6365, Loss CFM: 0.5199, Grad Norm: 6.0216, LR: 0.000019
Epoch 0, Iteration 11030, Loss: 5.1493, Loss AR: 4.4799, Loss CFM: 0.6694, Grad Norm: 3.9164, LR: 0.000019
Epoch 0, Iteration 11040, Loss: 4.9234, Loss AR: 4.3182, Loss CFM: 0.6053, Grad Norm: 4.6470, LR: 0.000019
Epoch 0, Iteration 11050, Loss: 4.9211, Loss AR: 4.3809, Loss CFM: 0.5402, Grad Norm: 3.7829, LR: 0.000019
Epoch 0, Iteration 11060, Loss: 5.4043, Loss AR: 4.8936, Loss CFM: 0.5106, Grad Norm: 8.8316, LR: 0.000019
Epoch 0, Iteration 11070, Loss: 4.9429, Loss AR: 4.4190, Loss CFM: 0.5239, Grad Norm: 3.4889, LR: 0.000019
Epoch 0, Iteration 11080, Loss: 5.0634, Loss AR: 4.4718, Loss CFM: 0.5915, Grad Norm: 5.2585, LR: 0.000019
Epoch 0, Iteration 11090, Loss: 5.0275, Loss AR: 4.5062, Loss CFM: 0.5213, Grad Norm: 5.6658, LR: 0.000019
Epoch 0, Iteration 11100, Loss: 5.1745, Loss AR: 4.6208, Loss CFM: 0.5537, Grad Norm: 3.9873, LR: 0.000019
Epoch 0, Iteration 11110, Loss: 5.0649, Loss AR: 4.4918, Loss CFM: 0.5730, Grad Norm: 3.7527, LR: 0.000019
Epoch 0, Iteration 11120, Loss: 4.8904, Loss AR: 4.3188, Loss CFM: 0.5716, Grad Norm: 3.0326, LR: 0.000019
Epoch 0, Iteration 11130, Loss: 5.1728, Loss AR: 4.5473, Loss CFM: 0.6255, Grad Norm: 4.2054, LR: 0.000019
Epoch 0, Iteration 11140, Loss: 5.0509, Loss AR: 4.5209, Loss CFM: 0.5300, Grad Norm: 3.4382, LR: 0.000019
Epoch 0, Iteration 11150, Loss: 5.3495, Loss AR: 4.7334, Loss CFM: 0.6161, Grad Norm: 5.2926, LR: 0.000019
Epoch 0, Iteration 11160, Loss: 5.3650, Loss AR: 4.8284, Loss CFM: 0.5366, Grad Norm: 7.7722, LR: 0.000019
Epoch 0, Iteration 11170, Loss: 5.0837, Loss AR: 4.5246, Loss CFM: 0.5590, Grad Norm: 4.3045, LR: 0.000019
Epoch 0, Iteration 11180, Loss: 4.7859, Loss AR: 4.3178, Loss CFM: 0.4680, Grad Norm: 2.8098, LR: 0.000019
Epoch 0, Iteration 11190, Loss: 5.0212, Loss AR: 4.4441, Loss CFM: 0.5772, Grad Norm: 4.5826, LR: 0.000019
Epoch 0, Iteration 11200, Loss: 5.1061, Loss AR: 4.5308, Loss CFM: 0.5753, Grad Norm: 3.4555, LR: 0.000019
Epoch 0, Iteration 11210, Loss: 5.2539, Loss AR: 4.6214, Loss CFM: 0.6325, Grad Norm: 4.3590, LR: 0.000019
Epoch 0, Iteration 11220, Loss: 5.1739, Loss AR: 4.5785, Loss CFM: 0.5954, Grad Norm: 4.2964, LR: 0.000019
Epoch 0, Iteration 11230, Loss: 4.9928, Loss AR: 4.4549, Loss CFM: 0.5379, Grad Norm: 3.4913, LR: 0.000019
Epoch 0, Iteration 11240, Loss: 5.1800, Loss AR: 4.7148, Loss CFM: 0.4651, Grad Norm: 4.2501, LR: 0.000019
Epoch 0, Iteration 11250, Loss: 5.0400, Loss AR: 4.5308, Loss CFM: 0.5092, Grad Norm: 4.3655, LR: 0.000019
Epoch 0, Iteration 11260, Loss: 5.0347, Loss AR: 4.4563, Loss CFM: 0.5783, Grad Norm: 3.8648, LR: 0.000019
Epoch 0, Iteration 11270, Loss: 5.0161, Loss AR: 4.4597, Loss CFM: 0.5564, Grad Norm: 4.0771, LR: 0.000019
Epoch 0, Iteration 11280, Loss: 5.3043, Loss AR: 4.7319, Loss CFM: 0.5724, Grad Norm: 3.8689, LR: 0.000019
Epoch 0, Iteration 11290, Loss: 5.0962, Loss AR: 4.5763, Loss CFM: 0.5199, Grad Norm: 3.4704, LR: 0.000019
Epoch 0, Iteration 11300, Loss: 4.9800, Loss AR: 4.4079, Loss CFM: 0.5721, Grad Norm: 3.9389, LR: 0.000019
Epoch 0, Iteration 11310, Loss: 5.0116, Loss AR: 4.4905, Loss CFM: 0.5211, Grad Norm: 3.9682, LR: 0.000019
Epoch 0, Iteration 11320, Loss: 4.7547, Loss AR: 4.2145, Loss CFM: 0.5402, Grad Norm: 4.0642, LR: 0.000019
Epoch 0, Iteration 11330, Loss: 4.8418, Loss AR: 4.2770, Loss CFM: 0.5648, Grad Norm: 4.1351, LR: 0.000019
Epoch 0, Iteration 11340, Loss: 5.1699, Loss AR: 4.6073, Loss CFM: 0.5626, Grad Norm: 4.1316, LR: 0.000019
Epoch 0, Iteration 11350, Loss: 4.9442, Loss AR: 4.4291, Loss CFM: 0.5151, Grad Norm: 3.8076, LR: 0.000019
Epoch 0, Iteration 11360, Loss: 5.0777, Loss AR: 4.5332, Loss CFM: 0.5445, Grad Norm: 2.9151, LR: 0.000019
Epoch 0, Iteration 11370, Loss: 5.1785, Loss AR: 4.5974, Loss CFM: 0.5811, Grad Norm: 3.9454, LR: 0.000019
Epoch 0, Iteration 11380, Loss: 5.2872, Loss AR: 4.7436, Loss CFM: 0.5436, Grad Norm: 5.2794, LR: 0.000019
Epoch 0, Iteration 11390, Loss: 5.3579, Loss AR: 4.7876, Loss CFM: 0.5703, Grad Norm: 4.4703, LR: 0.000019
Epoch 0, Iteration 11400, Loss: 4.8909, Loss AR: 4.3160, Loss CFM: 0.5748, Grad Norm: 4.6594, LR: 0.000019
Epoch 0, Iteration 11410, Loss: 5.0977, Loss AR: 4.5489, Loss CFM: 0.5488, Grad Norm: 4.8930, LR: 0.000019
Epoch 0, Iteration 11420, Loss: 5.0105, Loss AR: 4.4390, Loss CFM: 0.5715, Grad Norm: 2.9641, LR: 0.000019
Epoch 0, Iteration 11430, Loss: 4.9461, Loss AR: 4.3903, Loss CFM: 0.5558, Grad Norm: 5.0027, LR: 0.000019
Epoch 0, Iteration 11440, Loss: 5.2512, Loss AR: 4.7499, Loss CFM: 0.5013, Grad Norm: 5.1527, LR: 0.000019
Epoch 0, Iteration 11450, Loss: 5.0455, Loss AR: 4.4839, Loss CFM: 0.5615, Grad Norm: 3.9689, LR: 0.000019
Epoch 0, Iteration 11460, Loss: 4.9012, Loss AR: 4.3773, Loss CFM: 0.5238, Grad Norm: 3.6329, LR: 0.000019
Epoch 0, Iteration 11470, Loss: 4.9411, Loss AR: 4.3841, Loss CFM: 0.5570, Grad Norm: 4.4043, LR: 0.000019
Epoch 0, Iteration 11480, Loss: 5.2335, Loss AR: 4.7218, Loss CFM: 0.5117, Grad Norm: 4.8580, LR: 0.000019
Epoch 0, Iteration 11490, Loss: 4.9860, Loss AR: 4.4325, Loss CFM: 0.5535, Grad Norm: 4.2036, LR: 0.000019
Epoch 0, Iteration 11500, Loss: 4.9280, Loss AR: 4.3372, Loss CFM: 0.5908, Grad Norm: 3.9915, LR: 0.000019
Epoch 0, Iteration 11510, Loss: 5.0114, Loss AR: 4.4518, Loss CFM: 0.5595, Grad Norm: 4.7745, LR: 0.000019
Epoch 0, Iteration 11520, Loss: 5.1500, Loss AR: 4.5625, Loss CFM: 0.5875, Grad Norm: 4.6578, LR: 0.000019
Epoch 0, Iteration 11530, Loss: 4.9955, Loss AR: 4.4524, Loss CFM: 0.5431, Grad Norm: 3.6500, LR: 0.000019
Epoch 0, Iteration 11540, Loss: 5.0049, Loss AR: 4.3609, Loss CFM: 0.6440, Grad Norm: 3.9230, LR: 0.000019
Epoch 0, Iteration 11550, Loss: 5.0007, Loss AR: 4.4975, Loss CFM: 0.5033, Grad Norm: 3.8663, LR: 0.000019
Epoch 0, Iteration 11560, Loss: 4.8860, Loss AR: 4.3063, Loss CFM: 0.5797, Grad Norm: 4.6658, LR: 0.000019
Epoch 0, Iteration 11570, Loss: 5.1414, Loss AR: 4.6880, Loss CFM: 0.4534, Grad Norm: 5.5901, LR: 0.000019
Epoch 0, Iteration 11580, Loss: 5.0969, Loss AR: 4.4867, Loss CFM: 0.6101, Grad Norm: 4.1354, LR: 0.000019
Epoch 0, Iteration 11590, Loss: 5.1235, Loss AR: 4.6217, Loss CFM: 0.5017, Grad Norm: 3.8101, LR: 0.000019
Epoch 0, Iteration 11600, Loss: 4.8292, Loss AR: 4.2703, Loss CFM: 0.5589, Grad Norm: 4.0336, LR: 0.000019
Epoch 0, Iteration 11610, Loss: 4.9619, Loss AR: 4.3737, Loss CFM: 0.5882, Grad Norm: 4.5974, LR: 0.000019
Epoch 0, Iteration 11620, Loss: 5.0582, Loss AR: 4.4528, Loss CFM: 0.6054, Grad Norm: 4.3344, LR: 0.000019
Epoch 0, Iteration 11630, Loss: 4.9827, Loss AR: 4.4339, Loss CFM: 0.5487, Grad Norm: 4.1560, LR: 0.000019
Epoch 0, Iteration 11640, Loss: 5.2134, Loss AR: 4.6659, Loss CFM: 0.5474, Grad Norm: 4.6000, LR: 0.000019
Epoch 0, Iteration 11650, Loss: 5.2322, Loss AR: 4.6989, Loss CFM: 0.5332, Grad Norm: 3.3060, LR: 0.000019
Epoch 0, Iteration 11660, Loss: 5.3243, Loss AR: 4.7778, Loss CFM: 0.5464, Grad Norm: 3.5926, LR: 0.000019
Epoch 0, Iteration 11670, Loss: 5.0611, Loss AR: 4.5668, Loss CFM: 0.4943, Grad Norm: 6.3196, LR: 0.000019
Epoch 0, Iteration 11680, Loss: 5.2228, Loss AR: 4.6649, Loss CFM: 0.5579, Grad Norm: 4.8373, LR: 0.000019
Epoch 0, Iteration 11690, Loss: 5.3176, Loss AR: 4.8007, Loss CFM: 0.5169, Grad Norm: 5.9593, LR: 0.000019
Epoch 0, Iteration 11700, Loss: 4.9911, Loss AR: 4.4467, Loss CFM: 0.5444, Grad Norm: 4.3118, LR: 0.000019
Epoch 0, Iteration 11710, Loss: 5.0361, Loss AR: 4.5639, Loss CFM: 0.4722, Grad Norm: 3.4151, LR: 0.000019
Epoch 0, Iteration 11720, Loss: 5.2792, Loss AR: 4.7378, Loss CFM: 0.5414, Grad Norm: 3.7782, LR: 0.000019
Epoch 0, Iteration 11730, Loss: 5.2748, Loss AR: 4.7029, Loss CFM: 0.5719, Grad Norm: 3.5689, LR: 0.000019
Epoch 0, Iteration 11740, Loss: 4.8944, Loss AR: 4.3316, Loss CFM: 0.5628, Grad Norm: 5.1683, LR: 0.000019
Epoch 0, Iteration 11750, Loss: 5.1249, Loss AR: 4.5376, Loss CFM: 0.5873, Grad Norm: 4.4662, LR: 0.000019
Epoch 0, Iteration 11760, Loss: 5.0312, Loss AR: 4.4842, Loss CFM: 0.5469, Grad Norm: 3.3591, LR: 0.000019
Epoch 0, Iteration 11770, Loss: 5.3569, Loss AR: 4.7533, Loss CFM: 0.6036, Grad Norm: 4.2821, LR: 0.000019
Epoch 0, Iteration 11780, Loss: 5.1292, Loss AR: 4.5823, Loss CFM: 0.5469, Grad Norm: 3.7930, LR: 0.000019
Epoch 0, Iteration 11790, Loss: 5.1715, Loss AR: 4.6477, Loss CFM: 0.5238, Grad Norm: 4.8195, LR: 0.000019
Epoch 0, Iteration 11800, Loss: 5.1717, Loss AR: 4.6432, Loss CFM: 0.5285, Grad Norm: 6.4906, LR: 0.000019
Epoch 0, Iteration 11810, Loss: 5.0415, Loss AR: 4.5097, Loss CFM: 0.5318, Grad Norm: 4.2338, LR: 0.000019
Epoch 0, Iteration 11820, Loss: 5.2478, Loss AR: 4.7297, Loss CFM: 0.5181, Grad Norm: 5.4598, LR: 0.000019
Epoch 0, Iteration 11830, Loss: 5.1970, Loss AR: 4.5677, Loss CFM: 0.6293, Grad Norm: 3.4561, LR: 0.000019
Epoch 0, Iteration 11840, Loss: 4.9496, Loss AR: 4.3223, Loss CFM: 0.6273, Grad Norm: 3.6659, LR: 0.000019
Epoch 0, Iteration 11850, Loss: 5.0070, Loss AR: 4.4772, Loss CFM: 0.5298, Grad Norm: 3.8792, LR: 0.000019
Epoch 0, Iteration 11860, Loss: 5.0112, Loss AR: 4.5038, Loss CFM: 0.5074, Grad Norm: 4.1700, LR: 0.000019
Epoch 0, Iteration 11870, Loss: 4.9481, Loss AR: 4.4553, Loss CFM: 0.4927, Grad Norm: 3.4078, LR: 0.000019
Epoch 0, Iteration 11880, Loss: 5.1372, Loss AR: 4.5695, Loss CFM: 0.5677, Grad Norm: 3.1854, LR: 0.000019
Epoch 0, Iteration 11890, Loss: 5.0022, Loss AR: 4.4908, Loss CFM: 0.5114, Grad Norm: 4.7013, LR: 0.000019
Epoch 0, Iteration 11900, Loss: 5.2922, Loss AR: 4.7220, Loss CFM: 0.5702, Grad Norm: 3.6703, LR: 0.000019
Epoch 0, Iteration 11910, Loss: 5.4156, Loss AR: 4.8533, Loss CFM: 0.5622, Grad Norm: 4.6054, LR: 0.000019
Epoch 0, Iteration 11920, Loss: 4.8685, Loss AR: 4.2730, Loss CFM: 0.5954, Grad Norm: 4.8074, LR: 0.000019
Epoch 0, Iteration 11930, Loss: 5.2104, Loss AR: 4.6743, Loss CFM: 0.5361, Grad Norm: 5.0912, LR: 0.000019
Epoch 0, Iteration 11940, Loss: 5.0902, Loss AR: 4.5297, Loss CFM: 0.5605, Grad Norm: 4.2902, LR: 0.000019
Epoch 0, Iteration 11950, Loss: 5.0877, Loss AR: 4.5389, Loss CFM: 0.5487, Grad Norm: 5.0459, LR: 0.000019
Epoch 0, Iteration 11960, Loss: 4.9373, Loss AR: 4.4119, Loss CFM: 0.5254, Grad Norm: 5.1701, LR: 0.000019
Epoch 0, Iteration 11970, Loss: 5.1759, Loss AR: 4.5496, Loss CFM: 0.6263, Grad Norm: 4.9636, LR: 0.000019
Epoch 0, Iteration 11980, Loss: 5.2278, Loss AR: 4.6866, Loss CFM: 0.5413, Grad Norm: 3.9982, LR: 0.000019
Epoch 0, Iteration 11990, Loss: 5.1784, Loss AR: 4.5384, Loss CFM: 0.6400, Grad Norm: 3.9543, LR: 0.000019
Epoch 0, Iteration 12000, Loss: 5.0393, Loss AR: 4.4909, Loss CFM: 0.5484, Grad Norm: 3.4288, LR: 0.000019
Epoch 0, Iteration 12010, Loss: 5.1012, Loss AR: 4.5677, Loss CFM: 0.5335, Grad Norm: 4.3905, LR: 0.000019
Epoch 0, Iteration 12020, Loss: 5.5291, Loss AR: 4.7645, Loss CFM: 0.7646, Grad Norm: 4.5214, LR: 0.000019
Epoch 0, Iteration 12030, Loss: 5.4174, Loss AR: 4.8486, Loss CFM: 0.5688, Grad Norm: 5.5780, LR: 0.000019
Epoch 0, Iteration 12040, Loss: 5.0353, Loss AR: 4.5104, Loss CFM: 0.5249, Grad Norm: 4.0304, LR: 0.000019
Epoch 0, Iteration 12050, Loss: 5.2241, Loss AR: 4.7058, Loss CFM: 0.5183, Grad Norm: 3.6868, LR: 0.000019
Epoch 0, Iteration 12060, Loss: 5.0654, Loss AR: 4.5391, Loss CFM: 0.5263, Grad Norm: 3.4554, LR: 0.000019
Epoch 0, Iteration 12070, Loss: 4.9628, Loss AR: 4.4607, Loss CFM: 0.5021, Grad Norm: 4.3523, LR: 0.000019
Epoch 0, Iteration 12080, Loss: 5.3361, Loss AR: 4.7300, Loss CFM: 0.6061, Grad Norm: 6.0384, LR: 0.000019
Epoch 0, Iteration 12090, Loss: 5.0058, Loss AR: 4.4976, Loss CFM: 0.5082, Grad Norm: 5.3658, LR: 0.000019
Epoch 0, Iteration 12100, Loss: 5.0180, Loss AR: 4.4846, Loss CFM: 0.5334, Grad Norm: 5.1395, LR: 0.000019
Epoch 0, Iteration 12110, Loss: 5.7000, Loss AR: 5.1486, Loss CFM: 0.5514, Grad Norm: 5.1256, LR: 0.000019
Epoch 0, Iteration 12120, Loss: 5.1819, Loss AR: 4.6160, Loss CFM: 0.5660, Grad Norm: 3.3714, LR: 0.000019
Epoch 0, Iteration 12130, Loss: 5.1359, Loss AR: 4.5573, Loss CFM: 0.5785, Grad Norm: 4.6238, LR: 0.000019
Epoch 0, Iteration 12140, Loss: 5.3859, Loss AR: 4.8481, Loss CFM: 0.5378, Grad Norm: 6.3127, LR: 0.000019
Epoch 0, Iteration 12150, Loss: 5.3173, Loss AR: 4.7800, Loss CFM: 0.5373, Grad Norm: 4.5015, LR: 0.000019
Epoch 0, Iteration 12160, Loss: 5.0325, Loss AR: 4.5229, Loss CFM: 0.5096, Grad Norm: 4.4274, LR: 0.000019
Epoch 0, Iteration 12170, Loss: 5.0198, Loss AR: 4.5190, Loss CFM: 0.5008, Grad Norm: 4.6204, LR: 0.000019
Epoch 0, Iteration 12180, Loss: 4.8249, Loss AR: 4.2595, Loss CFM: 0.5654, Grad Norm: 3.7406, LR: 0.000019
Epoch 0, Iteration 12190, Loss: 4.9989, Loss AR: 4.4531, Loss CFM: 0.5458, Grad Norm: 5.2482, LR: 0.000019
Epoch 0, Iteration 12200, Loss: 5.0551, Loss AR: 4.4908, Loss CFM: 0.5643, Grad Norm: 3.3375, LR: 0.000019
Epoch 0, Iteration 12210, Loss: 5.1160, Loss AR: 4.6323, Loss CFM: 0.4838, Grad Norm: 4.5962, LR: 0.000019
Epoch 0, Iteration 12220, Loss: 4.9148, Loss AR: 4.3631, Loss CFM: 0.5516, Grad Norm: 4.4202, LR: 0.000019
Epoch 0, Iteration 12230, Loss: 5.2254, Loss AR: 4.6400, Loss CFM: 0.5854, Grad Norm: 3.4676, LR: 0.000019
Epoch 0, Iteration 12240, Loss: 5.0719, Loss AR: 4.5465, Loss CFM: 0.5254, Grad Norm: 5.1356, LR: 0.000019
Epoch 0, Iteration 12250, Loss: 5.1166, Loss AR: 4.5692, Loss CFM: 0.5474, Grad Norm: 3.3067, LR: 0.000019
Epoch 0, Iteration 12260, Loss: 5.0309, Loss AR: 4.4549, Loss CFM: 0.5760, Grad Norm: 4.2618, LR: 0.000019
Epoch 0, Iteration 12270, Loss: 5.0288, Loss AR: 4.4818, Loss CFM: 0.5470, Grad Norm: 3.8509, LR: 0.000019
Epoch 0, Iteration 12280, Loss: 4.9429, Loss AR: 4.4290, Loss CFM: 0.5139, Grad Norm: 3.7947, LR: 0.000019
Epoch 0, Iteration 12290, Loss: 5.0957, Loss AR: 4.5590, Loss CFM: 0.5367, Grad Norm: 4.9917, LR: 0.000019
Epoch 0, Iteration 12300, Loss: 5.0952, Loss AR: 4.5113, Loss CFM: 0.5838, Grad Norm: 3.5973, LR: 0.000019
Epoch 0, Iteration 12310, Loss: 5.2960, Loss AR: 4.6899, Loss CFM: 0.6061, Grad Norm: 4.5459, LR: 0.000019
Epoch 0, Iteration 12320, Loss: 4.9882, Loss AR: 4.4891, Loss CFM: 0.4991, Grad Norm: 3.1449, LR: 0.000019
Epoch 0, Iteration 12330, Loss: 5.0856, Loss AR: 4.5264, Loss CFM: 0.5592, Grad Norm: 3.7555, LR: 0.000019
Epoch 0, Iteration 12340, Loss: 5.1679, Loss AR: 4.5851, Loss CFM: 0.5828, Grad Norm: 3.7855, LR: 0.000019
Epoch 0, Iteration 12350, Loss: 5.1845, Loss AR: 4.6106, Loss CFM: 0.5739, Grad Norm: 3.4471, LR: 0.000019
Epoch 0, Iteration 12360, Loss: 4.8812, Loss AR: 4.3474, Loss CFM: 0.5338, Grad Norm: 3.2518, LR: 0.000019
Epoch 0, Iteration 12370, Loss: 5.2590, Loss AR: 4.6710, Loss CFM: 0.5880, Grad Norm: 3.5427, LR: 0.000019
Epoch 0, Iteration 12380, Loss: 5.2354, Loss AR: 4.7203, Loss CFM: 0.5152, Grad Norm: 4.4653, LR: 0.000019
Epoch 0, Iteration 12390, Loss: 4.9718, Loss AR: 4.3690, Loss CFM: 0.6027, Grad Norm: 3.4776, LR: 0.000019
Epoch 0, Iteration 12400, Loss: 4.8589, Loss AR: 4.3447, Loss CFM: 0.5141, Grad Norm: 3.9119, LR: 0.000019
Epoch 0, Iteration 12410, Loss: 5.0918, Loss AR: 4.5209, Loss CFM: 0.5708, Grad Norm: 3.9527, LR: 0.000019
Epoch 0, Iteration 12420, Loss: 5.3099, Loss AR: 4.8027, Loss CFM: 0.5072, Grad Norm: 4.6909, LR: 0.000019
Epoch 0, Iteration 12430, Loss: 5.1836, Loss AR: 4.6498, Loss CFM: 0.5338, Grad Norm: 3.5312, LR: 0.000019
Epoch 0, Iteration 12440, Loss: 4.9706, Loss AR: 4.4279, Loss CFM: 0.5428, Grad Norm: 4.3782, LR: 0.000019
Epoch 0, Iteration 12450, Loss: 5.1043, Loss AR: 4.5454, Loss CFM: 0.5589, Grad Norm: 6.1070, LR: 0.000019
Epoch 0, Iteration 12460, Loss: 5.1175, Loss AR: 4.6268, Loss CFM: 0.4907, Grad Norm: 2.9771, LR: 0.000019
Epoch 0, Iteration 12470, Loss: 5.2513, Loss AR: 4.7006, Loss CFM: 0.5507, Grad Norm: 2.8978, LR: 0.000019
Epoch 0, Iteration 12480, Loss: 4.9229, Loss AR: 4.4240, Loss CFM: 0.4989, Grad Norm: 4.5161, LR: 0.000019
Epoch 0, Iteration 12490, Loss: 5.1709, Loss AR: 4.7235, Loss CFM: 0.4473, Grad Norm: 3.7647, LR: 0.000019
Epoch 0, Iteration 12500, Loss: 4.9834, Loss AR: 4.4420, Loss CFM: 0.5415, Grad Norm: 3.2334, LR: 0.000019
Epoch 0, Iteration 12510, Loss: 5.1279, Loss AR: 4.6067, Loss CFM: 0.5212, Grad Norm: 4.3271, LR: 0.000019
Epoch 0, Iteration 12520, Loss: 5.2834, Loss AR: 4.6938, Loss CFM: 0.5896, Grad Norm: 3.9812, LR: 0.000019
Epoch 0, Iteration 12530, Loss: 4.9610, Loss AR: 4.4751, Loss CFM: 0.4859, Grad Norm: 7.3451, LR: 0.000019
Epoch 0, Iteration 12540, Loss: 4.9268, Loss AR: 4.3657, Loss CFM: 0.5610, Grad Norm: 4.2092, LR: 0.000019
Epoch 0, Iteration 12550, Loss: 5.1720, Loss AR: 4.5795, Loss CFM: 0.5925, Grad Norm: 3.1585, LR: 0.000019
Epoch 0, Iteration 12560, Loss: 5.1055, Loss AR: 4.5649, Loss CFM: 0.5406, Grad Norm: 3.6811, LR: 0.000019
Epoch 0, Iteration 12570, Loss: 5.1110, Loss AR: 4.6198, Loss CFM: 0.4912, Grad Norm: 3.8021, LR: 0.000019
Epoch 0, Iteration 12580, Loss: 4.9582, Loss AR: 4.4575, Loss CFM: 0.5007, Grad Norm: 4.1074, LR: 0.000019
Epoch 0, Iteration 12590, Loss: 5.3344, Loss AR: 4.7725, Loss CFM: 0.5619, Grad Norm: 6.6974, LR: 0.000019
Epoch 0, Iteration 12600, Loss: 5.2675, Loss AR: 4.7599, Loss CFM: 0.5076, Grad Norm: 5.2460, LR: 0.000019
Epoch 0, Iteration 12610, Loss: 5.2418, Loss AR: 4.7090, Loss CFM: 0.5328, Grad Norm: 3.7269, LR: 0.000019
Epoch 0, Iteration 12620, Loss: 4.9086, Loss AR: 4.3612, Loss CFM: 0.5474, Grad Norm: 3.8234, LR: 0.000019
Epoch 0, Iteration 12630, Loss: 5.2962, Loss AR: 4.7425, Loss CFM: 0.5537, Grad Norm: 4.7652, LR: 0.000019
Epoch 0, Iteration 12640, Loss: 4.7849, Loss AR: 4.1944, Loss CFM: 0.5905, Grad Norm: 4.5159, LR: 0.000019
Epoch 0, Iteration 12650, Loss: 5.1044, Loss AR: 4.5177, Loss CFM: 0.5867, Grad Norm: 4.1675, LR: 0.000019
Epoch 0, Iteration 12660, Loss: 5.0577, Loss AR: 4.5345, Loss CFM: 0.5231, Grad Norm: 4.8070, LR: 0.000019
Epoch 0, Iteration 12670, Loss: 5.8868, Loss AR: 5.3087, Loss CFM: 0.5781, Grad Norm: 4.3485, LR: 0.000019
Epoch 0, Iteration 12680, Loss: 5.0305, Loss AR: 4.5274, Loss CFM: 0.5031, Grad Norm: 3.2690, LR: 0.000019
Epoch 0, Iteration 12690, Loss: 5.0902, Loss AR: 4.5376, Loss CFM: 0.5526, Grad Norm: 3.8143, LR: 0.000019
Epoch 0, Iteration 12700, Loss: 5.2184, Loss AR: 4.6817, Loss CFM: 0.5368, Grad Norm: 4.7530, LR: 0.000019
Epoch 0, Iteration 12710, Loss: 4.9490, Loss AR: 4.3931, Loss CFM: 0.5559, Grad Norm: 6.8924, LR: 0.000019
Epoch 0, Iteration 12720, Loss: 5.2389, Loss AR: 4.5981, Loss CFM: 0.6409, Grad Norm: 3.7854, LR: 0.000019
Epoch 0, Iteration 12730, Loss: 4.8618, Loss AR: 4.3210, Loss CFM: 0.5408, Grad Norm: 3.9933, LR: 0.000019
Epoch 0, Iteration 12740, Loss: 5.1890, Loss AR: 4.5784, Loss CFM: 0.6106, Grad Norm: 4.6760, LR: 0.000019
Epoch 0, Iteration 12750, Loss: 4.9800, Loss AR: 4.4068, Loss CFM: 0.5733, Grad Norm: 4.4331, LR: 0.000019
Epoch 0, Iteration 12760, Loss: 4.9000, Loss AR: 4.3641, Loss CFM: 0.5359, Grad Norm: 2.8651, LR: 0.000019
Epoch 0, Iteration 12770, Loss: 4.8925, Loss AR: 4.3807, Loss CFM: 0.5118, Grad Norm: 6.8647, LR: 0.000019
Epoch 0, Iteration 12780, Loss: 5.2052, Loss AR: 4.6755, Loss CFM: 0.5297, Grad Norm: 4.0822, LR: 0.000019
Epoch 0, Iteration 12790, Loss: 5.0578, Loss AR: 4.5390, Loss CFM: 0.5188, Grad Norm: 3.5861, LR: 0.000019
Epoch 0, Iteration 12800, Loss: 5.2326, Loss AR: 4.6579, Loss CFM: 0.5747, Grad Norm: 3.8490, LR: 0.000019
Epoch 0, Iteration 12810, Loss: 5.0278, Loss AR: 4.4879, Loss CFM: 0.5399, Grad Norm: 3.2967, LR: 0.000019
Epoch 0, Iteration 12820, Loss: 4.8800, Loss AR: 4.3806, Loss CFM: 0.4994, Grad Norm: 4.9141, LR: 0.000019
Epoch 0, Iteration 12830, Loss: 5.2566, Loss AR: 4.7498, Loss CFM: 0.5068, Grad Norm: 4.3981, LR: 0.000019
Epoch 0, Iteration 12840, Loss: 5.0578, Loss AR: 4.5483, Loss CFM: 0.5095, Grad Norm: 3.7339, LR: 0.000019
Epoch 0, Iteration 12850, Loss: 5.0436, Loss AR: 4.5783, Loss CFM: 0.4653, Grad Norm: 3.5109, LR: 0.000019
Epoch 0, Iteration 12860, Loss: 4.8017, Loss AR: 4.2910, Loss CFM: 0.5107, Grad Norm: 3.5987, LR: 0.000019
Epoch 0, Iteration 12870, Loss: 5.2623, Loss AR: 4.7602, Loss CFM: 0.5020, Grad Norm: 3.6033, LR: 0.000019
Epoch 0, Iteration 12880, Loss: 5.0568, Loss AR: 4.5596, Loss CFM: 0.4971, Grad Norm: 4.6294, LR: 0.000019
Epoch 0, Iteration 12890, Loss: 5.0008, Loss AR: 4.4991, Loss CFM: 0.5017, Grad Norm: 4.2142, LR: 0.000019
Epoch 0, Iteration 12900, Loss: 5.2205, Loss AR: 4.6254, Loss CFM: 0.5951, Grad Norm: 3.6414, LR: 0.000019
Epoch 0, Iteration 12910, Loss: 5.0629, Loss AR: 4.5850, Loss CFM: 0.4779, Grad Norm: 3.1570, LR: 0.000019
Epoch 0, Iteration 12920, Loss: 5.1494, Loss AR: 4.5855, Loss CFM: 0.5639, Grad Norm: 4.3423, LR: 0.000019
Epoch 0, Iteration 12930, Loss: 5.1179, Loss AR: 4.5840, Loss CFM: 0.5340, Grad Norm: 3.9969, LR: 0.000019
Epoch 0, Iteration 12940, Loss: 5.0939, Loss AR: 4.5243, Loss CFM: 0.5696, Grad Norm: 3.6993, LR: 0.000019
Epoch 0, Iteration 12950, Loss: 4.8421, Loss AR: 4.3182, Loss CFM: 0.5239, Grad Norm: 5.5169, LR: 0.000019
Epoch 0, Iteration 12960, Loss: 5.0159, Loss AR: 4.4409, Loss CFM: 0.5750, Grad Norm: 5.0030, LR: 0.000019
Epoch 0, Iteration 12970, Loss: 5.1824, Loss AR: 4.6085, Loss CFM: 0.5739, Grad Norm: 4.3342, LR: 0.000019
Epoch 0, Iteration 12980, Loss: 5.2756, Loss AR: 4.7304, Loss CFM: 0.5453, Grad Norm: 3.2230, LR: 0.000019
Epoch 0, Iteration 12990, Loss: 5.3879, Loss AR: 4.8826, Loss CFM: 0.5053, Grad Norm: 4.3985, LR: 0.000019
Epoch 0, Iteration 13000, Loss: 4.8854, Loss AR: 4.3519, Loss CFM: 0.5335, Grad Norm: 3.4760, LR: 0.000019
Epoch 0, Iteration 13010, Loss: 5.0599, Loss AR: 4.4600, Loss CFM: 0.5999, Grad Norm: 2.6523, LR: 0.000019
Epoch 0, Iteration 13020, Loss: 5.0899, Loss AR: 4.5118, Loss CFM: 0.5781, Grad Norm: 3.5912, LR: 0.000019
Epoch 0, Iteration 13030, Loss: 5.0235, Loss AR: 4.4394, Loss CFM: 0.5840, Grad Norm: 4.1330, LR: 0.000019
Epoch 0, Iteration 13040, Loss: 5.1264, Loss AR: 4.5713, Loss CFM: 0.5551, Grad Norm: 3.2696, LR: 0.000019
Epoch 0, Iteration 13050, Loss: 4.8569, Loss AR: 4.3860, Loss CFM: 0.4709, Grad Norm: 3.4569, LR: 0.000019
Epoch 0, Iteration 13060, Loss: 5.0869, Loss AR: 4.5530, Loss CFM: 0.5339, Grad Norm: 6.3409, LR: 0.000019
Epoch 0, Iteration 13070, Loss: 5.0137, Loss AR: 4.4202, Loss CFM: 0.5934, Grad Norm: 3.6321, LR: 0.000019
Epoch 0, Iteration 13080, Loss: 5.0979, Loss AR: 4.4876, Loss CFM: 0.6103, Grad Norm: 4.1739, LR: 0.000019
Epoch 0, Iteration 13090, Loss: 5.2059, Loss AR: 4.6430, Loss CFM: 0.5629, Grad Norm: 4.5046, LR: 0.000019
Epoch 0, Iteration 13100, Loss: 5.1056, Loss AR: 4.6119, Loss CFM: 0.4937, Grad Norm: 3.1467, LR: 0.000019
Epoch 0, Iteration 13110, Loss: 5.0748, Loss AR: 4.5271, Loss CFM: 0.5478, Grad Norm: 3.3977, LR: 0.000019
Epoch 0, Iteration 13120, Loss: 4.9484, Loss AR: 4.3423, Loss CFM: 0.6061, Grad Norm: 4.6408, LR: 0.000019
Epoch 0, Iteration 13130, Loss: 5.2203, Loss AR: 4.6262, Loss CFM: 0.5941, Grad Norm: 4.5483, LR: 0.000019
Epoch 0, Iteration 13140, Loss: 5.0547, Loss AR: 4.4871, Loss CFM: 0.5676, Grad Norm: 5.5619, LR: 0.000019
Epoch 0, Iteration 13150, Loss: 4.8157, Loss AR: 4.3147, Loss CFM: 0.5010, Grad Norm: 2.7698, LR: 0.000019
Epoch 0, Iteration 13160, Loss: 5.0107, Loss AR: 4.4352, Loss CFM: 0.5755, Grad Norm: 3.8139, LR: 0.000019
Epoch 0, Iteration 13170, Loss: 4.9513, Loss AR: 4.4252, Loss CFM: 0.5261, Grad Norm: 4.1719, LR: 0.000019
Epoch 0, Iteration 13180, Loss: 5.1493, Loss AR: 4.6505, Loss CFM: 0.4988, Grad Norm: 3.7575, LR: 0.000019
Epoch 0, Iteration 13190, Loss: 5.2763, Loss AR: 4.6797, Loss CFM: 0.5966, Grad Norm: 6.3610, LR: 0.000019
Epoch 0, Iteration 13200, Loss: 4.9175, Loss AR: 4.4319, Loss CFM: 0.4855, Grad Norm: 3.7553, LR: 0.000019
Epoch 0, Iteration 13210, Loss: 5.2409, Loss AR: 4.6678, Loss CFM: 0.5731, Grad Norm: 4.8223, LR: 0.000019
Epoch 0, Iteration 13220, Loss: 4.8840, Loss AR: 4.3315, Loss CFM: 0.5525, Grad Norm: 4.1002, LR: 0.000019
Epoch 0, Iteration 13230, Loss: 5.0690, Loss AR: 4.4969, Loss CFM: 0.5721, Grad Norm: 2.5645, LR: 0.000019
Epoch 0, Iteration 13240, Loss: 5.0597, Loss AR: 4.5235, Loss CFM: 0.5362, Grad Norm: 3.4543, LR: 0.000019
Epoch 0, Iteration 13250, Loss: 4.9556, Loss AR: 4.4029, Loss CFM: 0.5527, Grad Norm: 5.1736, LR: 0.000019
Epoch 0, Iteration 13260, Loss: 4.9920, Loss AR: 4.4255, Loss CFM: 0.5665, Grad Norm: 4.3796, LR: 0.000019
Epoch 0, Iteration 13270, Loss: 4.9954, Loss AR: 4.5142, Loss CFM: 0.4812, Grad Norm: 3.9677, LR: 0.000019
Epoch 0, Iteration 13280, Loss: 5.0765, Loss AR: 4.5937, Loss CFM: 0.4828, Grad Norm: 3.7719, LR: 0.000019
Epoch 0, Iteration 13290, Loss: 4.9763, Loss AR: 4.4897, Loss CFM: 0.4866, Grad Norm: 4.4012, LR: 0.000019
Epoch 0, Iteration 13300, Loss: 5.3785, Loss AR: 4.8376, Loss CFM: 0.5409, Grad Norm: 4.1396, LR: 0.000019
Epoch 0, Iteration 13310, Loss: 5.1315, Loss AR: 4.5294, Loss CFM: 0.6022, Grad Norm: 4.6869, LR: 0.000019
Epoch 0, Iteration 13320, Loss: 5.3349, Loss AR: 4.7166, Loss CFM: 0.6183, Grad Norm: 4.4413, LR: 0.000019
Epoch 0, Iteration 13330, Loss: 5.0340, Loss AR: 4.5498, Loss CFM: 0.4842, Grad Norm: 4.4221, LR: 0.000019
Epoch 0, Iteration 13340, Loss: 4.9408, Loss AR: 4.3540, Loss CFM: 0.5868, Grad Norm: 4.2008, LR: 0.000019
Epoch 0, Iteration 13350, Loss: 5.1061, Loss AR: 4.5406, Loss CFM: 0.5655, Grad Norm: 7.2665, LR: 0.000019
Epoch 0, Iteration 13360, Loss: 5.1296, Loss AR: 4.5328, Loss CFM: 0.5968, Grad Norm: 4.3944, LR: 0.000019
Epoch 0, Iteration 13370, Loss: 5.1378, Loss AR: 4.6301, Loss CFM: 0.5077, Grad Norm: 4.1619, LR: 0.000019
Epoch 0, Iteration 13380, Loss: 5.1958, Loss AR: 4.6235, Loss CFM: 0.5724, Grad Norm: 4.8143, LR: 0.000019
Epoch 0, Iteration 13390, Loss: 5.0371, Loss AR: 4.5405, Loss CFM: 0.4966, Grad Norm: 3.1126, LR: 0.000019
Epoch 0, Iteration 13400, Loss: 4.9826, Loss AR: 4.4076, Loss CFM: 0.5750, Grad Norm: 4.4973, LR: 0.000019
Epoch 0, Iteration 13410, Loss: 4.9536, Loss AR: 4.4420, Loss CFM: 0.5116, Grad Norm: 5.2401, LR: 0.000019
Epoch 0, Iteration 13420, Loss: 4.9435, Loss AR: 4.3325, Loss CFM: 0.6109, Grad Norm: 4.3473, LR: 0.000019
Epoch 0, Iteration 13430, Loss: 5.0188, Loss AR: 4.4644, Loss CFM: 0.5544, Grad Norm: 5.2846, LR: 0.000019
Epoch 0, Iteration 13440, Loss: 4.8937, Loss AR: 4.3369, Loss CFM: 0.5568, Grad Norm: 3.9647, LR: 0.000019
Epoch 0, Iteration 13450, Loss: 5.1212, Loss AR: 4.5289, Loss CFM: 0.5923, Grad Norm: 3.9477, LR: 0.000019
Epoch 0, Iteration 13460, Loss: 5.1718, Loss AR: 4.5997, Loss CFM: 0.5721, Grad Norm: 4.0085, LR: 0.000019
Epoch 0, Iteration 13470, Loss: 5.1430, Loss AR: 4.6256, Loss CFM: 0.5175, Grad Norm: 4.3608, LR: 0.000019
Epoch 0, Iteration 13480, Loss: 5.0066, Loss AR: 4.4862, Loss CFM: 0.5204, Grad Norm: 3.8663, LR: 0.000019
Epoch 0, Iteration 13490, Loss: 4.7558, Loss AR: 4.2575, Loss CFM: 0.4983, Grad Norm: 5.3254, LR: 0.000019
Epoch 0, Iteration 13500, Loss: 5.0526, Loss AR: 4.5427, Loss CFM: 0.5099, Grad Norm: 3.4424, LR: 0.000019
Epoch 0, Iteration 13510, Loss: 5.0791, Loss AR: 4.6070, Loss CFM: 0.4721, Grad Norm: 5.8722, LR: 0.000019
Epoch 0, Iteration 13520, Loss: 4.9125, Loss AR: 4.4135, Loss CFM: 0.4990, Grad Norm: 3.9715, LR: 0.000019
Epoch 0, Iteration 13530, Loss: 4.9885, Loss AR: 4.4498, Loss CFM: 0.5388, Grad Norm: 4.2242, LR: 0.000019
Epoch 0, Iteration 13540, Loss: 4.8367, Loss AR: 4.3204, Loss CFM: 0.5164, Grad Norm: 4.1968, LR: 0.000019
Epoch 0, Iteration 13550, Loss: 5.2163, Loss AR: 4.6094, Loss CFM: 0.6070, Grad Norm: 4.0988, LR: 0.000019
Epoch 0, Iteration 13560, Loss: 5.0937, Loss AR: 4.5019, Loss CFM: 0.5918, Grad Norm: 4.5055, LR: 0.000019
Epoch 0, Iteration 13570, Loss: 5.0828, Loss AR: 4.5349, Loss CFM: 0.5479, Grad Norm: 5.0732, LR: 0.000019
Epoch 0, Iteration 13580, Loss: 5.2056, Loss AR: 4.6454, Loss CFM: 0.5602, Grad Norm: 3.4114, LR: 0.000019
Epoch 0, Iteration 13590, Loss: 5.3344, Loss AR: 4.8027, Loss CFM: 0.5317, Grad Norm: 3.7604, LR: 0.000019
Epoch 0, Iteration 13600, Loss: 5.1391, Loss AR: 4.6072, Loss CFM: 0.5319, Grad Norm: 3.8578, LR: 0.000019
Epoch 0, Iteration 13610, Loss: 5.0064, Loss AR: 4.4088, Loss CFM: 0.5976, Grad Norm: 4.0171, LR: 0.000019
Epoch 0, Iteration 13620, Loss: 5.1139, Loss AR: 4.6175, Loss CFM: 0.4964, Grad Norm: 3.5298, LR: 0.000019
Epoch 0, Iteration 13630, Loss: 5.1150, Loss AR: 4.5818, Loss CFM: 0.5332, Grad Norm: 4.7402, LR: 0.000019
Epoch 0, Iteration 13640, Loss: 4.8917, Loss AR: 4.3646, Loss CFM: 0.5271, Grad Norm: 4.0920, LR: 0.000019
Epoch 0, Iteration 13650, Loss: 4.8800, Loss AR: 4.3599, Loss CFM: 0.5201, Grad Norm: 4.9291, LR: 0.000019
Epoch 0, Iteration 13660, Loss: 4.9020, Loss AR: 4.3312, Loss CFM: 0.5707, Grad Norm: 4.0078, LR: 0.000019
Epoch 0, Iteration 13670, Loss: 4.8057, Loss AR: 4.3230, Loss CFM: 0.4827, Grad Norm: 3.5143, LR: 0.000019
Epoch 0, Iteration 13680, Loss: 4.9937, Loss AR: 4.4323, Loss CFM: 0.5614, Grad Norm: 2.9635, LR: 0.000019
Epoch 0, Iteration 13690, Loss: 5.1419, Loss AR: 4.5439, Loss CFM: 0.5980, Grad Norm: 5.3073, LR: 0.000019
Epoch 0, Iteration 13700, Loss: 4.9777, Loss AR: 4.4259, Loss CFM: 0.5518, Grad Norm: 4.6351, LR: 0.000019
Epoch 0, Iteration 13710, Loss: 4.9691, Loss AR: 4.4476, Loss CFM: 0.5214, Grad Norm: 4.5553, LR: 0.000019
Epoch 0, Iteration 13720, Loss: 4.9747, Loss AR: 4.4034, Loss CFM: 0.5713, Grad Norm: 5.9968, LR: 0.000019
Epoch 0, Iteration 13730, Loss: 4.9946, Loss AR: 4.4076, Loss CFM: 0.5870, Grad Norm: 4.1807, LR: 0.000019
Epoch 0, Iteration 13740, Loss: 4.9527, Loss AR: 4.4441, Loss CFM: 0.5086, Grad Norm: 4.3022, LR: 0.000019
Epoch 0, Iteration 13750, Loss: 5.1799, Loss AR: 4.6084, Loss CFM: 0.5715, Grad Norm: 4.4289, LR: 0.000019
Epoch 0, Iteration 13760, Loss: 5.0406, Loss AR: 4.4835, Loss CFM: 0.5571, Grad Norm: 3.2198, LR: 0.000019
Epoch 0, Iteration 13770, Loss: 5.1631, Loss AR: 4.5678, Loss CFM: 0.5953, Grad Norm: 4.4040, LR: 0.000019
Epoch 0, Iteration 13780, Loss: 5.1601, Loss AR: 4.6539, Loss CFM: 0.5063, Grad Norm: 4.3802, LR: 0.000019
Epoch 0, Iteration 13790, Loss: 5.2250, Loss AR: 4.6638, Loss CFM: 0.5613, Grad Norm: 3.8652, LR: 0.000019
Epoch 0, Iteration 13800, Loss: 5.1839, Loss AR: 4.6300, Loss CFM: 0.5539, Grad Norm: 4.0946, LR: 0.000019
Epoch 0, Iteration 13810, Loss: 5.0358, Loss AR: 4.4856, Loss CFM: 0.5502, Grad Norm: 5.7325, LR: 0.000019
Epoch 0, Iteration 13820, Loss: 4.9455, Loss AR: 4.4458, Loss CFM: 0.4996, Grad Norm: 3.2673, LR: 0.000019
Epoch 0, Iteration 13830, Loss: 4.9491, Loss AR: 4.2712, Loss CFM: 0.6779, Grad Norm: 3.7645, LR: 0.000019
Epoch 0, Iteration 13840, Loss: 5.1591, Loss AR: 4.6141, Loss CFM: 0.5450, Grad Norm: 3.1803, LR: 0.000019
Epoch 0, Iteration 13850, Loss: 4.8596, Loss AR: 4.3298, Loss CFM: 0.5297, Grad Norm: 3.4924, LR: 0.000019
Epoch 0, Iteration 13860, Loss: 4.9968, Loss AR: 4.4806, Loss CFM: 0.5162, Grad Norm: 3.9526, LR: 0.000019
Epoch 0, Iteration 13870, Loss: 5.1523, Loss AR: 4.5774, Loss CFM: 0.5749, Grad Norm: 3.6246, LR: 0.000019
Epoch 0, Iteration 13880, Loss: 4.9994, Loss AR: 4.4224, Loss CFM: 0.5771, Grad Norm: 3.9497, LR: 0.000019
Epoch 0, Iteration 13890, Loss: 5.0688, Loss AR: 4.5187, Loss CFM: 0.5500, Grad Norm: 3.9956, LR: 0.000019
Epoch 0, Iteration 13900, Loss: 5.1571, Loss AR: 4.6540, Loss CFM: 0.5031, Grad Norm: 3.9414, LR: 0.000019
Epoch 0, Iteration 13910, Loss: 5.2011, Loss AR: 4.6721, Loss CFM: 0.5290, Grad Norm: 6.6336, LR: 0.000019
Epoch 0, Iteration 13920, Loss: 5.1475, Loss AR: 4.5501, Loss CFM: 0.5974, Grad Norm: 3.7907, LR: 0.000019
Epoch 0, Iteration 13930, Loss: 5.0469, Loss AR: 4.4744, Loss CFM: 0.5725, Grad Norm: 3.6636, LR: 0.000019
Epoch 0, Iteration 13940, Loss: 4.9582, Loss AR: 4.4425, Loss CFM: 0.5157, Grad Norm: 3.5509, LR: 0.000019
Epoch 0, Iteration 13950, Loss: 5.0867, Loss AR: 4.4809, Loss CFM: 0.6058, Grad Norm: 5.2087, LR: 0.000019
Epoch 0, Iteration 13960, Loss: 4.9842, Loss AR: 4.5170, Loss CFM: 0.4672, Grad Norm: 4.0121, LR: 0.000019
Epoch 0, Iteration 13970, Loss: 5.1212, Loss AR: 4.6214, Loss CFM: 0.4998, Grad Norm: 3.4951, LR: 0.000019
Epoch 0, Iteration 13980, Loss: 5.1187, Loss AR: 4.5849, Loss CFM: 0.5337, Grad Norm: 3.5616, LR: 0.000019
Epoch 0, Iteration 13990, Loss: 5.2283, Loss AR: 4.6178, Loss CFM: 0.6105, Grad Norm: 4.6009, LR: 0.000019
Epoch 0, Iteration 14000, Loss: 5.3343, Loss AR: 4.7243, Loss CFM: 0.6100, Grad Norm: 3.7421, LR: 0.000019
Epoch 0, Iteration 14010, Loss: 5.3385, Loss AR: 4.7391, Loss CFM: 0.5994, Grad Norm: 4.8332, LR: 0.000019
Epoch 0, Iteration 14020, Loss: 5.0616, Loss AR: 4.4878, Loss CFM: 0.5738, Grad Norm: 5.0257, LR: 0.000019
Epoch 0, Iteration 14030, Loss: 5.1659, Loss AR: 4.5692, Loss CFM: 0.5967, Grad Norm: 3.7621, LR: 0.000019
Epoch 0, Iteration 14040, Loss: 4.7164, Loss AR: 4.1571, Loss CFM: 0.5593, Grad Norm: 4.8211, LR: 0.000019
Epoch 0, Iteration 14050, Loss: 5.2958, Loss AR: 4.6327, Loss CFM: 0.6631, Grad Norm: 4.0915, LR: 0.000019
Epoch 0, Iteration 14060, Loss: 5.1276, Loss AR: 4.4906, Loss CFM: 0.6370, Grad Norm: 5.2692, LR: 0.000019
Epoch 0, Iteration 14070, Loss: 4.9575, Loss AR: 4.4450, Loss CFM: 0.5124, Grad Norm: 5.7756, LR: 0.000019
Epoch 0, Iteration 14080, Loss: 5.0607, Loss AR: 4.5114, Loss CFM: 0.5493, Grad Norm: 4.8344, LR: 0.000019
Epoch 0, Iteration 14090, Loss: 5.1643, Loss AR: 4.6128, Loss CFM: 0.5515, Grad Norm: 3.4023, LR: 0.000019
Epoch 0, Iteration 14100, Loss: 5.3316, Loss AR: 4.8511, Loss CFM: 0.4805, Grad Norm: 4.5127, LR: 0.000019
Epoch 0, Iteration 14110, Loss: 5.1560, Loss AR: 4.5907, Loss CFM: 0.5653, Grad Norm: 3.4254, LR: 0.000019
Epoch 0, Iteration 14120, Loss: 5.0774, Loss AR: 4.5519, Loss CFM: 0.5255, Grad Norm: 6.8486, LR: 0.000019
Epoch 0, Iteration 14130, Loss: 5.2428, Loss AR: 4.6459, Loss CFM: 0.5969, Grad Norm: 5.1426, LR: 0.000019
Epoch 0, Iteration 14140, Loss: 5.0655, Loss AR: 4.5160, Loss CFM: 0.5495, Grad Norm: 4.0223, LR: 0.000019
Epoch 0, Iteration 14150, Loss: 4.9179, Loss AR: 4.4071, Loss CFM: 0.5108, Grad Norm: 4.0876, LR: 0.000019
Epoch 0, Iteration 14160, Loss: 5.3809, Loss AR: 4.8475, Loss CFM: 0.5334, Grad Norm: 6.6999, LR: 0.000019
Epoch 0, Iteration 14170, Loss: 5.1306, Loss AR: 4.5844, Loss CFM: 0.5463, Grad Norm: 3.5699, LR: 0.000019
Epoch 0, Iteration 14180, Loss: 5.2061, Loss AR: 4.6277, Loss CFM: 0.5785, Grad Norm: 3.6751, LR: 0.000019
Epoch 0, Iteration 14190, Loss: 4.9425, Loss AR: 4.3801, Loss CFM: 0.5623, Grad Norm: 5.3705, LR: 0.000019
Epoch 0, Iteration 14200, Loss: 5.2841, Loss AR: 4.7614, Loss CFM: 0.5227, Grad Norm: 4.1816, LR: 0.000019
Epoch 0, Iteration 14210, Loss: 5.1497, Loss AR: 4.6723, Loss CFM: 0.4774, Grad Norm: 3.3064, LR: 0.000019
Epoch 0, Iteration 14220, Loss: 5.3349, Loss AR: 4.7061, Loss CFM: 0.6288, Grad Norm: 7.9830, LR: 0.000019
Epoch 0, Iteration 14230, Loss: 5.0795, Loss AR: 4.5168, Loss CFM: 0.5628, Grad Norm: 4.4466, LR: 0.000019
Epoch 0, Iteration 14240, Loss: 5.1402, Loss AR: 4.5756, Loss CFM: 0.5646, Grad Norm: 4.8813, LR: 0.000019
Epoch 0, Iteration 14250, Loss: 5.2128, Loss AR: 4.6535, Loss CFM: 0.5594, Grad Norm: 3.6034, LR: 0.000019
Epoch 0, Iteration 14260, Loss: 5.1522, Loss AR: 4.5886, Loss CFM: 0.5636, Grad Norm: 3.8495, LR: 0.000019
Epoch 0, Iteration 14270, Loss: 5.2127, Loss AR: 4.6601, Loss CFM: 0.5526, Grad Norm: 4.0216, LR: 0.000019
Epoch 0, Iteration 14280, Loss: 4.9387, Loss AR: 4.4502, Loss CFM: 0.4885, Grad Norm: 4.6524, LR: 0.000019
Epoch 0, Iteration 14290, Loss: 5.1380, Loss AR: 4.5755, Loss CFM: 0.5625, Grad Norm: 4.5431, LR: 0.000019
Epoch 0, Iteration 14300, Loss: 5.1087, Loss AR: 4.6030, Loss CFM: 0.5057, Grad Norm: 3.4898, LR: 0.000019
Epoch 0, Iteration 14310, Loss: 5.1701, Loss AR: 4.5851, Loss CFM: 0.5850, Grad Norm: 3.6287, LR: 0.000019
Epoch 0, Iteration 14320, Loss: 4.9330, Loss AR: 4.3744, Loss CFM: 0.5586, Grad Norm: 4.1471, LR: 0.000019
Epoch 0, Iteration 14330, Loss: 5.1024, Loss AR: 4.5762, Loss CFM: 0.5262, Grad Norm: 6.4566, LR: 0.000019
Epoch 0, Iteration 14340, Loss: 5.1900, Loss AR: 4.6508, Loss CFM: 0.5392, Grad Norm: 3.4341, LR: 0.000019
Epoch 0, Iteration 14350, Loss: 5.0591, Loss AR: 4.4251, Loss CFM: 0.6339, Grad Norm: 5.7810, LR: 0.000019
Epoch 0, Iteration 14360, Loss: 5.2436, Loss AR: 4.6895, Loss CFM: 0.5541, Grad Norm: 4.0949, LR: 0.000019
Epoch 0, Iteration 14370, Loss: 4.8029, Loss AR: 4.2257, Loss CFM: 0.5772, Grad Norm: 5.4276, LR: 0.000019
Epoch 0, Iteration 14380, Loss: 4.9354, Loss AR: 4.3484, Loss CFM: 0.5871, Grad Norm: 3.9637, LR: 0.000019
Epoch 0, Iteration 14390, Loss: 5.3138, Loss AR: 4.7539, Loss CFM: 0.5598, Grad Norm: 4.1062, LR: 0.000019
Epoch 0, Iteration 14400, Loss: 4.9922, Loss AR: 4.5009, Loss CFM: 0.4912, Grad Norm: 4.3981, LR: 0.000019
Epoch 0, Iteration 14410, Loss: 5.1578, Loss AR: 4.6159, Loss CFM: 0.5420, Grad Norm: 3.2190, LR: 0.000019
Epoch 0, Iteration 14420, Loss: 5.1703, Loss AR: 4.5413, Loss CFM: 0.6290, Grad Norm: 5.5760, LR: 0.000019
Epoch 0, Iteration 14430, Loss: 5.0393, Loss AR: 4.4810, Loss CFM: 0.5583, Grad Norm: 4.7139, LR: 0.000019
Epoch 0, Iteration 14440, Loss: 5.1710, Loss AR: 4.6541, Loss CFM: 0.5170, Grad Norm: 4.6371, LR: 0.000019
Epoch 0, Iteration 14450, Loss: 5.2421, Loss AR: 4.7608, Loss CFM: 0.4814, Grad Norm: 5.9022, LR: 0.000019
Epoch 0, Iteration 14460, Loss: 5.2225, Loss AR: 4.7365, Loss CFM: 0.4860, Grad Norm: 5.5324, LR: 0.000019
Epoch 0, Iteration 14470, Loss: 4.8805, Loss AR: 4.2996, Loss CFM: 0.5809, Grad Norm: 5.0250, LR: 0.000019
Epoch 0, Iteration 14480, Loss: 5.0652, Loss AR: 4.4650, Loss CFM: 0.6003, Grad Norm: 4.2335, LR: 0.000019
Epoch 0, Iteration 14490, Loss: 5.1454, Loss AR: 4.6028, Loss CFM: 0.5426, Grad Norm: 3.6191, LR: 0.000019
Epoch 0, Iteration 14500, Loss: 5.2220, Loss AR: 4.6683, Loss CFM: 0.5537, Grad Norm: 4.6893, LR: 0.000019
Epoch 0, Iteration 14510, Loss: 4.9078, Loss AR: 4.4164, Loss CFM: 0.4914, Grad Norm: 5.2684, LR: 0.000019
Epoch 0, Iteration 14520, Loss: 5.1991, Loss AR: 4.5070, Loss CFM: 0.6921, Grad Norm: 4.4006, LR: 0.000019
Epoch 0, Iteration 14530, Loss: 4.8488, Loss AR: 4.3386, Loss CFM: 0.5102, Grad Norm: 3.6221, LR: 0.000019
Epoch 0, Iteration 14540, Loss: 5.4131, Loss AR: 4.8420, Loss CFM: 0.5712, Grad Norm: 4.1345, LR: 0.000019
Epoch 0, Iteration 14550, Loss: 5.1868, Loss AR: 4.6225, Loss CFM: 0.5643, Grad Norm: 3.4130, LR: 0.000019
Epoch 0, Iteration 14560, Loss: 5.1532, Loss AR: 4.6438, Loss CFM: 0.5093, Grad Norm: 3.8358, LR: 0.000019
Epoch 0, Iteration 14570, Loss: 5.0297, Loss AR: 4.4691, Loss CFM: 0.5606, Grad Norm: 4.3305, LR: 0.000019
Epoch 0, Iteration 14580, Loss: 5.2574, Loss AR: 4.6623, Loss CFM: 0.5951, Grad Norm: 3.9143, LR: 0.000019
Epoch 0, Iteration 14590, Loss: 5.0745, Loss AR: 4.5043, Loss CFM: 0.5702, Grad Norm: 4.6293, LR: 0.000019
Epoch 0, Iteration 14600, Loss: 5.0498, Loss AR: 4.4905, Loss CFM: 0.5593, Grad Norm: 3.7319, LR: 0.000019
Epoch 0, Iteration 14610, Loss: 5.0839, Loss AR: 4.5328, Loss CFM: 0.5511, Grad Norm: 4.2723, LR: 0.000019
Epoch 0, Iteration 14620, Loss: 4.9322, Loss AR: 4.3950, Loss CFM: 0.5373, Grad Norm: 3.8170, LR: 0.000019
Epoch 0, Iteration 14630, Loss: 4.8839, Loss AR: 4.3213, Loss CFM: 0.5627, Grad Norm: 4.2889, LR: 0.000019
Epoch 0, Iteration 14640, Loss: 5.0685, Loss AR: 4.4704, Loss CFM: 0.5981, Grad Norm: 3.4270, LR: 0.000019
Epoch 0, Iteration 14650, Loss: 5.1593, Loss AR: 4.5618, Loss CFM: 0.5975, Grad Norm: 3.0108, LR: 0.000019
Epoch 0, Iteration 14660, Loss: 4.8393, Loss AR: 4.2924, Loss CFM: 0.5469, Grad Norm: 4.0707, LR: 0.000019
Epoch 0, Iteration 14670, Loss: 4.9537, Loss AR: 4.3521, Loss CFM: 0.6016, Grad Norm: 4.0527, LR: 0.000019
Epoch 0, Iteration 14680, Loss: 4.8597, Loss AR: 4.3202, Loss CFM: 0.5395, Grad Norm: 3.8521, LR: 0.000019
Epoch 0, Iteration 14690, Loss: 5.1356, Loss AR: 4.6167, Loss CFM: 0.5189, Grad Norm: 4.3571, LR: 0.000019
Epoch 0, Iteration 14700, Loss: 5.3545, Loss AR: 4.8255, Loss CFM: 0.5290, Grad Norm: 3.5309, LR: 0.000019
Epoch 0, Iteration 14710, Loss: 5.2093, Loss AR: 4.6696, Loss CFM: 0.5397, Grad Norm: 3.1221, LR: 0.000019
Epoch 0, Iteration 14720, Loss: 5.2706, Loss AR: 4.7164, Loss CFM: 0.5543, Grad Norm: 4.6565, LR: 0.000019
Epoch 0, Iteration 14730, Loss: 4.8913, Loss AR: 4.3924, Loss CFM: 0.4989, Grad Norm: 4.2834, LR: 0.000019
Epoch 0, Iteration 14740, Loss: 4.9877, Loss AR: 4.4989, Loss CFM: 0.4888, Grad Norm: 3.4368, LR: 0.000019
Epoch 0, Iteration 14750, Loss: 4.8939, Loss AR: 4.4125, Loss CFM: 0.4814, Grad Norm: 15.5473, LR: 0.000019
Epoch 0, Iteration 14760, Loss: 4.9598, Loss AR: 4.3938, Loss CFM: 0.5661, Grad Norm: 3.4845, LR: 0.000019
Epoch 0, Iteration 14770, Loss: 5.2925, Loss AR: 4.7118, Loss CFM: 0.5807, Grad Norm: 5.5248, LR: 0.000019
Epoch 0, Iteration 14780, Loss: 4.9470, Loss AR: 4.4046, Loss CFM: 0.5424, Grad Norm: 3.0921, LR: 0.000019
Epoch 0, Iteration 14790, Loss: 4.9521, Loss AR: 4.3988, Loss CFM: 0.5534, Grad Norm: 3.7681, LR: 0.000019
Epoch 0, Iteration 14800, Loss: 4.9145, Loss AR: 4.3685, Loss CFM: 0.5461, Grad Norm: 4.2969, LR: 0.000019
Epoch 0, Iteration 14810, Loss: 5.2725, Loss AR: 4.6623, Loss CFM: 0.6102, Grad Norm: 5.3889, LR: 0.000019
Epoch 0, Iteration 14820, Loss: 5.2144, Loss AR: 4.6043, Loss CFM: 0.6101, Grad Norm: 4.4292, LR: 0.000019
Epoch 0, Iteration 14830, Loss: 5.1725, Loss AR: 4.6131, Loss CFM: 0.5594, Grad Norm: 5.1212, LR: 0.000019
Epoch 0, Iteration 14840, Loss: 4.9750, Loss AR: 4.3403, Loss CFM: 0.6347, Grad Norm: 8.9493, LR: 0.000019
Epoch 0, Iteration 14850, Loss: 5.0389, Loss AR: 4.5261, Loss CFM: 0.5128, Grad Norm: 4.1437, LR: 0.000019
Epoch 0, Iteration 14860, Loss: 5.3391, Loss AR: 4.7548, Loss CFM: 0.5843, Grad Norm: 4.3292, LR: 0.000019
Epoch 0, Iteration 14870, Loss: 4.9140, Loss AR: 4.3439, Loss CFM: 0.5701, Grad Norm: 3.9964, LR: 0.000019
Epoch 0, Iteration 14880, Loss: 5.2302, Loss AR: 4.7409, Loss CFM: 0.4892, Grad Norm: 3.9792, LR: 0.000019
Epoch 0, Iteration 14890, Loss: 4.8733, Loss AR: 4.3426, Loss CFM: 0.5307, Grad Norm: 3.8104, LR: 0.000019
Epoch 0, Iteration 14900, Loss: 4.9883, Loss AR: 4.4217, Loss CFM: 0.5666, Grad Norm: 3.5086, LR: 0.000019
Epoch 0, Iteration 14910, Loss: 5.0388, Loss AR: 4.4576, Loss CFM: 0.5812, Grad Norm: 3.9461, LR: 0.000019
Epoch 0, Iteration 14920, Loss: 5.1983, Loss AR: 4.6410, Loss CFM: 0.5573, Grad Norm: 4.4113, LR: 0.000019
Epoch 0, Iteration 14930, Loss: 5.0404, Loss AR: 4.4713, Loss CFM: 0.5691, Grad Norm: 3.4072, LR: 0.000019
Epoch 0, Iteration 14940, Loss: 4.8729, Loss AR: 4.3786, Loss CFM: 0.4944, Grad Norm: 4.1395, LR: 0.000019
Epoch 0, Iteration 14950, Loss: 5.0088, Loss AR: 4.4986, Loss CFM: 0.5103, Grad Norm: 3.8332, LR: 0.000019
Epoch 0, Iteration 14960, Loss: 5.1750, Loss AR: 4.6365, Loss CFM: 0.5386, Grad Norm: 4.3177, LR: 0.000019
Epoch 0, Iteration 14970, Loss: 5.0288, Loss AR: 4.4843, Loss CFM: 0.5445, Grad Norm: 4.3883, LR: 0.000019
Epoch 0, Iteration 14980, Loss: 5.1843, Loss AR: 4.6796, Loss CFM: 0.5047, Grad Norm: 4.7926, LR: 0.000019
Epoch 0, Iteration 14990, Loss: 5.3099, Loss AR: 4.7049, Loss CFM: 0.6050, Grad Norm: 3.2323, LR: 0.000019
Epoch 0, Iteration 15000, Loss: 5.3313, Loss AR: 4.8188, Loss CFM: 0.5125, Grad Norm: 4.8615, LR: 0.000019
Epoch 0, Iteration 15010, Loss: 5.0475, Loss AR: 4.5578, Loss CFM: 0.4897, Grad Norm: 3.1480, LR: 0.000019
Epoch 0, Iteration 15020, Loss: 5.1673, Loss AR: 4.5391, Loss CFM: 0.6282, Grad Norm: 3.9370, LR: 0.000019
Epoch 0, Iteration 15030, Loss: 5.2921, Loss AR: 4.7672, Loss CFM: 0.5248, Grad Norm: 3.7058, LR: 0.000019
Epoch 0, Iteration 15040, Loss: 4.9337, Loss AR: 4.4488, Loss CFM: 0.4849, Grad Norm: 3.8950, LR: 0.000019
Epoch 0, Iteration 15050, Loss: 5.1471, Loss AR: 4.6036, Loss CFM: 0.5435, Grad Norm: 4.3297, LR: 0.000019
Epoch 0, Iteration 15060, Loss: 4.8470, Loss AR: 4.3536, Loss CFM: 0.4934, Grad Norm: 4.3922, LR: 0.000019
Epoch 0, Iteration 15070, Loss: 4.8442, Loss AR: 4.2921, Loss CFM: 0.5521, Grad Norm: 3.0372, LR: 0.000019
Epoch 0, Iteration 15080, Loss: 5.0385, Loss AR: 4.4350, Loss CFM: 0.6035, Grad Norm: 3.2287, LR: 0.000019
Epoch 0, Iteration 15090, Loss: 5.2841, Loss AR: 4.7528, Loss CFM: 0.5313, Grad Norm: 4.2263, LR: 0.000019
Epoch 0, Iteration 15100, Loss: 5.0357, Loss AR: 4.5056, Loss CFM: 0.5301, Grad Norm: 3.6744, LR: 0.000019
Epoch 0, Iteration 15110, Loss: 4.8268, Loss AR: 4.3531, Loss CFM: 0.4737, Grad Norm: 4.3279, LR: 0.000019
Epoch 0, Iteration 15120, Loss: 5.1130, Loss AR: 4.5775, Loss CFM: 0.5356, Grad Norm: 3.2759, LR: 0.000019
Epoch 0, Iteration 15130, Loss: 4.9715, Loss AR: 4.3921, Loss CFM: 0.5795, Grad Norm: 3.0764, LR: 0.000019
Epoch 0, Iteration 15140, Loss: 4.9725, Loss AR: 4.4593, Loss CFM: 0.5132, Grad Norm: 3.3375, LR: 0.000019
Epoch 0, Iteration 15150, Loss: 5.2372, Loss AR: 4.7081, Loss CFM: 0.5291, Grad Norm: 4.2438, LR: 0.000019
Epoch 0, Iteration 15160, Loss: 4.9877, Loss AR: 4.3888, Loss CFM: 0.5989, Grad Norm: 4.0975, LR: 0.000019
Epoch 0, Iteration 15170, Loss: 5.0474, Loss AR: 4.5665, Loss CFM: 0.4809, Grad Norm: 3.4801, LR: 0.000019
Epoch 0, Iteration 15180, Loss: 5.2991, Loss AR: 4.7527, Loss CFM: 0.5464, Grad Norm: 3.6524, LR: 0.000019
Epoch 0, Iteration 15190, Loss: 5.2476, Loss AR: 4.7147, Loss CFM: 0.5328, Grad Norm: 4.4054, LR: 0.000019
Epoch 0, Iteration 15200, Loss: 5.1822, Loss AR: 4.5870, Loss CFM: 0.5951, Grad Norm: 4.9898, LR: 0.000019
Epoch 0, Iteration 15210, Loss: 5.3098, Loss AR: 4.7494, Loss CFM: 0.5604, Grad Norm: 3.7645, LR: 0.000019
Epoch 0, Iteration 15220, Loss: 5.1141, Loss AR: 4.5563, Loss CFM: 0.5578, Grad Norm: 3.5765, LR: 0.000019
Epoch 0, Iteration 15230, Loss: 4.9178, Loss AR: 4.4402, Loss CFM: 0.4776, Grad Norm: 4.0564, LR: 0.000019
Epoch 0, Iteration 15240, Loss: 4.8599, Loss AR: 4.2654, Loss CFM: 0.5944, Grad Norm: 4.9716, LR: 0.000019
Epoch 0, Iteration 15250, Loss: 5.0099, Loss AR: 4.4680, Loss CFM: 0.5419, Grad Norm: 3.5722, LR: 0.000019
Epoch 0, Iteration 15260, Loss: 5.1810, Loss AR: 4.6704, Loss CFM: 0.5106, Grad Norm: 4.4520, LR: 0.000019
Epoch 0, Iteration 15270, Loss: 4.9277, Loss AR: 4.3729, Loss CFM: 0.5548, Grad Norm: 4.0954, LR: 0.000019
Epoch 0, Iteration 15280, Loss: 5.2776, Loss AR: 4.6939, Loss CFM: 0.5838, Grad Norm: 4.8789, LR: 0.000019
Epoch 0, Iteration 15290, Loss: 5.2107, Loss AR: 4.6606, Loss CFM: 0.5500, Grad Norm: 4.0331, LR: 0.000019
Epoch 0, Iteration 15300, Loss: 4.9743, Loss AR: 4.5167, Loss CFM: 0.4576, Grad Norm: 5.0819, LR: 0.000019
Epoch 0, Iteration 15310, Loss: 4.9965, Loss AR: 4.4688, Loss CFM: 0.5276, Grad Norm: 4.4162, LR: 0.000019
Epoch 0, Iteration 15320, Loss: 5.1576, Loss AR: 4.6497, Loss CFM: 0.5079, Grad Norm: 3.8238, LR: 0.000019
Epoch 0, Iteration 15330, Loss: 5.1635, Loss AR: 4.6506, Loss CFM: 0.5129, Grad Norm: 6.0618, LR: 0.000019
Epoch 0, Iteration 15340, Loss: 5.0861, Loss AR: 4.5499, Loss CFM: 0.5362, Grad Norm: 3.4067, LR: 0.000019
Epoch 0, Iteration 15350, Loss: 5.4118, Loss AR: 4.8631, Loss CFM: 0.5487, Grad Norm: 4.6244, LR: 0.000019
Epoch 0, Iteration 15360, Loss: 5.1448, Loss AR: 4.5650, Loss CFM: 0.5798, Grad Norm: 3.3144, LR: 0.000019
Epoch 0, Iteration 15370, Loss: 5.2604, Loss AR: 4.7405, Loss CFM: 0.5199, Grad Norm: 3.9666, LR: 0.000019
Epoch 0, Iteration 15380, Loss: 5.1712, Loss AR: 4.5750, Loss CFM: 0.5962, Grad Norm: 4.4310, LR: 0.000019
Epoch 0, Iteration 15390, Loss: 5.1945, Loss AR: 4.6163, Loss CFM: 0.5782, Grad Norm: 3.2034, LR: 0.000019
Epoch 0, Iteration 15400, Loss: 4.9141, Loss AR: 4.3925, Loss CFM: 0.5216, Grad Norm: 3.3661, LR: 0.000019
Epoch 0, Iteration 15410, Loss: 5.2860, Loss AR: 4.7733, Loss CFM: 0.5127, Grad Norm: 3.9863, LR: 0.000019
Epoch 0, Iteration 15420, Loss: 5.0462, Loss AR: 4.4794, Loss CFM: 0.5669, Grad Norm: 4.7339, LR: 0.000019
Epoch 0, Iteration 15430, Loss: 5.0916, Loss AR: 4.5679, Loss CFM: 0.5237, Grad Norm: 3.3490, LR: 0.000019
Epoch 0, Iteration 15440, Loss: 4.9574, Loss AR: 4.4647, Loss CFM: 0.4927, Grad Norm: 4.2652, LR: 0.000019
Epoch 0, Iteration 15450, Loss: 5.3317, Loss AR: 4.7609, Loss CFM: 0.5708, Grad Norm: 3.5873, LR: 0.000019
Epoch 0, Iteration 15460, Loss: 4.9313, Loss AR: 4.3970, Loss CFM: 0.5343, Grad Norm: 5.9766, LR: 0.000019
Epoch 0, Iteration 15470, Loss: 5.2674, Loss AR: 4.6761, Loss CFM: 0.5913, Grad Norm: 3.0017, LR: 0.000019
Epoch 0, Iteration 15480, Loss: 5.1235, Loss AR: 4.5960, Loss CFM: 0.5275, Grad Norm: 3.9834, LR: 0.000019
Epoch 0, Iteration 15490, Loss: 5.0446, Loss AR: 4.5311, Loss CFM: 0.5135, Grad Norm: 5.7809, LR: 0.000019
Epoch 0, Iteration 15500, Loss: 5.2729, Loss AR: 4.7744, Loss CFM: 0.4985, Grad Norm: 4.2606, LR: 0.000019
Epoch 0, Iteration 15510, Loss: 4.9542, Loss AR: 4.4357, Loss CFM: 0.5185, Grad Norm: 4.4092, LR: 0.000019
Epoch 0, Iteration 15520, Loss: 5.1167, Loss AR: 4.6379, Loss CFM: 0.4787, Grad Norm: 5.0901, LR: 0.000019
Epoch 0, Iteration 15530, Loss: 5.3443, Loss AR: 4.8464, Loss CFM: 0.4978, Grad Norm: 3.9728, LR: 0.000019
Epoch 0, Iteration 15540, Loss: 4.9558, Loss AR: 4.3737, Loss CFM: 0.5820, Grad Norm: 3.3626, LR: 0.000019
Epoch 0, Iteration 15550, Loss: 5.1094, Loss AR: 4.5557, Loss CFM: 0.5537, Grad Norm: 3.4175, LR: 0.000019
Epoch 0, Iteration 15560, Loss: 5.0286, Loss AR: 4.4838, Loss CFM: 0.5448, Grad Norm: 3.8160, LR: 0.000019
Epoch 0, Iteration 15570, Loss: 4.9960, Loss AR: 4.4570, Loss CFM: 0.5390, Grad Norm: 4.4280, LR: 0.000019
Epoch 0, Iteration 15580, Loss: 4.9655, Loss AR: 4.4662, Loss CFM: 0.4992, Grad Norm: 3.7741, LR: 0.000019
Epoch 0, Iteration 15590, Loss: 4.8485, Loss AR: 4.2646, Loss CFM: 0.5839, Grad Norm: 9.2567, LR: 0.000019
Epoch 0, Iteration 15600, Loss: 5.0541, Loss AR: 4.4816, Loss CFM: 0.5725, Grad Norm: 9.6874, LR: 0.000019
Epoch 0, Iteration 15610, Loss: 5.1866, Loss AR: 4.6389, Loss CFM: 0.5477, Grad Norm: 3.5979, LR: 0.000019
Epoch 0, Iteration 15620, Loss: 4.9724, Loss AR: 4.4790, Loss CFM: 0.4934, Grad Norm: 4.3554, LR: 0.000019
Epoch 0, Iteration 15630, Loss: 5.0765, Loss AR: 4.5171, Loss CFM: 0.5594, Grad Norm: 4.3137, LR: 0.000019
Epoch 0, Iteration 15640, Loss: 4.8589, Loss AR: 4.3730, Loss CFM: 0.4859, Grad Norm: 4.1497, LR: 0.000019
Epoch 0, Iteration 15650, Loss: 4.8890, Loss AR: 4.3469, Loss CFM: 0.5422, Grad Norm: 3.2853, LR: 0.000019
Epoch 0, Iteration 15660, Loss: 5.0641, Loss AR: 4.5048, Loss CFM: 0.5592, Grad Norm: 3.2485, LR: 0.000019
Epoch 0, Iteration 15670, Loss: 4.9225, Loss AR: 4.3504, Loss CFM: 0.5722, Grad Norm: 3.0774, LR: 0.000019
Epoch 0, Iteration 15680, Loss: 5.3913, Loss AR: 4.8662, Loss CFM: 0.5251, Grad Norm: 3.9802, LR: 0.000019
Epoch 0, Iteration 15690, Loss: 4.9511, Loss AR: 4.3410, Loss CFM: 0.6100, Grad Norm: 3.4939, LR: 0.000019
Epoch 0, Iteration 15700, Loss: 5.2635, Loss AR: 4.6417, Loss CFM: 0.6218, Grad Norm: 4.1515, LR: 0.000019
Epoch 0, Iteration 15710, Loss: 5.2748, Loss AR: 4.6843, Loss CFM: 0.5905, Grad Norm: 6.2183, LR: 0.000019
Epoch 0, Iteration 15720, Loss: 5.0673, Loss AR: 4.5299, Loss CFM: 0.5374, Grad Norm: 3.0776, LR: 0.000019
Epoch 0, Iteration 15730, Loss: 4.9819, Loss AR: 4.4349, Loss CFM: 0.5470, Grad Norm: 3.7053, LR: 0.000019
Epoch 0, Iteration 15740, Loss: 5.0799, Loss AR: 4.5224, Loss CFM: 0.5575, Grad Norm: 3.1167, LR: 0.000019
Epoch 0, Iteration 15750, Loss: 4.9285, Loss AR: 4.3679, Loss CFM: 0.5607, Grad Norm: 3.6542, LR: 0.000019
Epoch 0, Iteration 15760, Loss: 5.0376, Loss AR: 4.5492, Loss CFM: 0.4884, Grad Norm: 4.3022, LR: 0.000019
Epoch 0, Iteration 15770, Loss: 5.0524, Loss AR: 4.5299, Loss CFM: 0.5224, Grad Norm: 4.3548, LR: 0.000019
Epoch 0, Iteration 15780, Loss: 5.0747, Loss AR: 4.5316, Loss CFM: 0.5431, Grad Norm: 6.0117, LR: 0.000019
Epoch 0, Iteration 15790, Loss: 5.0630, Loss AR: 4.5258, Loss CFM: 0.5372, Grad Norm: 3.4783, LR: 0.000019
Epoch 0, Iteration 15800, Loss: 5.2348, Loss AR: 4.6369, Loss CFM: 0.5979, Grad Norm: 3.9991, LR: 0.000019
Epoch 0, Iteration 15810, Loss: 5.1310, Loss AR: 4.6043, Loss CFM: 0.5267, Grad Norm: 4.2456, LR: 0.000019
Epoch 0, Iteration 15820, Loss: 4.9850, Loss AR: 4.4049, Loss CFM: 0.5801, Grad Norm: 3.6451, LR: 0.000019
Epoch 0, Iteration 15830, Loss: 5.1294, Loss AR: 4.5587, Loss CFM: 0.5707, Grad Norm: 3.8570, LR: 0.000019
Epoch 0, Iteration 15840, Loss: 4.9288, Loss AR: 4.4193, Loss CFM: 0.5096, Grad Norm: 8.7278, LR: 0.000019
Epoch 0, Iteration 15850, Loss: 4.9961, Loss AR: 4.5016, Loss CFM: 0.4945, Grad Norm: 4.1121, LR: 0.000019
Epoch 0, Iteration 15860, Loss: 5.3154, Loss AR: 4.8066, Loss CFM: 0.5087, Grad Norm: 4.2072, LR: 0.000019
Epoch 0, Iteration 15870, Loss: 5.1655, Loss AR: 4.6826, Loss CFM: 0.4830, Grad Norm: 4.3744, LR: 0.000019
Epoch 0, Iteration 15880, Loss: 5.0690, Loss AR: 4.5543, Loss CFM: 0.5148, Grad Norm: 5.0391, LR: 0.000019
Epoch 0, Iteration 15890, Loss: 5.2484, Loss AR: 4.7175, Loss CFM: 0.5309, Grad Norm: 3.7077, LR: 0.000019
Epoch 0, Iteration 15900, Loss: 5.0054, Loss AR: 4.4446, Loss CFM: 0.5608, Grad Norm: 4.2475, LR: 0.000019
Epoch 0, Iteration 15910, Loss: 4.9377, Loss AR: 4.3557, Loss CFM: 0.5820, Grad Norm: 4.8428, LR: 0.000019
Epoch 0, Iteration 15920, Loss: 5.0859, Loss AR: 4.5849, Loss CFM: 0.5010, Grad Norm: 3.9922, LR: 0.000019
Epoch 0, Iteration 15930, Loss: 4.7602, Loss AR: 4.2384, Loss CFM: 0.5218, Grad Norm: 3.2352, LR: 0.000019
Epoch 0, Iteration 15940, Loss: 4.9385, Loss AR: 4.4322, Loss CFM: 0.5063, Grad Norm: 4.1691, LR: 0.000019
Epoch 0, Iteration 15950, Loss: 5.1898, Loss AR: 4.6194, Loss CFM: 0.5704, Grad Norm: 5.7473, LR: 0.000019
Epoch 0, Iteration 15960, Loss: 4.9883, Loss AR: 4.4722, Loss CFM: 0.5161, Grad Norm: 4.0396, LR: 0.000019
Epoch 0, Iteration 15970, Loss: 5.1418, Loss AR: 4.6277, Loss CFM: 0.5142, Grad Norm: 4.1142, LR: 0.000019
Epoch 0, Iteration 15980, Loss: 5.2196, Loss AR: 4.6665, Loss CFM: 0.5531, Grad Norm: 5.7628, LR: 0.000019
Epoch 0, Iteration 15990, Loss: 5.1228, Loss AR: 4.5631, Loss CFM: 0.5598, Grad Norm: 6.5311, LR: 0.000019
Epoch 0, Iteration 16000, Loss: 5.1200, Loss AR: 4.5941, Loss CFM: 0.5260, Grad Norm: 3.9235, LR: 0.000019
Epoch 0, Iteration 16010, Loss: 5.0476, Loss AR: 4.5143, Loss CFM: 0.5333, Grad Norm: 3.2533, LR: 0.000019
Epoch 0, Iteration 16020, Loss: 5.0739, Loss AR: 4.5905, Loss CFM: 0.4834, Grad Norm: 3.3808, LR: 0.000019
Epoch 0, Iteration 16030, Loss: 5.1968, Loss AR: 4.6012, Loss CFM: 0.5956, Grad Norm: 4.9902, LR: 0.000019
Epoch 0, Iteration 16040, Loss: 4.9795, Loss AR: 4.5014, Loss CFM: 0.4781, Grad Norm: 5.1453, LR: 0.000019
Epoch 0, Iteration 16050, Loss: 5.0915, Loss AR: 4.5425, Loss CFM: 0.5490, Grad Norm: 5.1310, LR: 0.000019
Epoch 0, Iteration 16060, Loss: 5.2850, Loss AR: 4.7559, Loss CFM: 0.5291, Grad Norm: 4.2070, LR: 0.000019
Epoch 0, Iteration 16070, Loss: 5.1108, Loss AR: 4.5949, Loss CFM: 0.5159, Grad Norm: 3.9911, LR: 0.000019
Epoch 0, Iteration 16080, Loss: 5.1365, Loss AR: 4.5496, Loss CFM: 0.5869, Grad Norm: 3.6476, LR: 0.000019
Epoch 0, Iteration 16090, Loss: 5.2261, Loss AR: 4.7305, Loss CFM: 0.4957, Grad Norm: 4.3955, LR: 0.000019
Epoch 0, Iteration 16100, Loss: 4.8890, Loss AR: 4.3067, Loss CFM: 0.5823, Grad Norm: 6.4894, LR: 0.000019
Epoch 0, Iteration 16110, Loss: 5.2315, Loss AR: 4.6327, Loss CFM: 0.5989, Grad Norm: 8.1024, LR: 0.000019
Epoch 0, Iteration 16120, Loss: 4.7378, Loss AR: 4.2257, Loss CFM: 0.5121, Grad Norm: 4.1300, LR: 0.000019
Epoch 0, Iteration 16130, Loss: 5.2543, Loss AR: 4.7311, Loss CFM: 0.5232, Grad Norm: 3.0863, LR: 0.000019
Epoch 0, Iteration 16140, Loss: 5.1819, Loss AR: 4.5956, Loss CFM: 0.5863, Grad Norm: 4.3383, LR: 0.000019
Epoch 0, Iteration 16150, Loss: 5.1757, Loss AR: 4.5989, Loss CFM: 0.5768, Grad Norm: 3.3331, LR: 0.000019
Epoch 0, Iteration 16160, Loss: 4.9884, Loss AR: 4.5212, Loss CFM: 0.4671, Grad Norm: 3.9470, LR: 0.000019
Epoch 0, Iteration 16170, Loss: 4.9310, Loss AR: 4.4540, Loss CFM: 0.4770, Grad Norm: 3.1730, LR: 0.000019
Epoch 0, Iteration 16180, Loss: 5.1116, Loss AR: 4.5762, Loss CFM: 0.5355, Grad Norm: 4.5429, LR: 0.000019
Epoch 0, Iteration 16190, Loss: 4.9849, Loss AR: 4.3680, Loss CFM: 0.6170, Grad Norm: 3.7388, LR: 0.000019
Epoch 0, Iteration 16200, Loss: 5.0010, Loss AR: 4.4364, Loss CFM: 0.5646, Grad Norm: 3.5731, LR: 0.000019
Epoch 0, Iteration 16210, Loss: 4.9305, Loss AR: 4.4335, Loss CFM: 0.4969, Grad Norm: 3.1123, LR: 0.000019
Epoch 0, Iteration 16220, Loss: 5.0243, Loss AR: 4.5179, Loss CFM: 0.5064, Grad Norm: 3.2860, LR: 0.000019
Epoch 0, Iteration 16230, Loss: 5.1230, Loss AR: 4.6098, Loss CFM: 0.5132, Grad Norm: 5.0852, LR: 0.000019
Epoch 0, Iteration 16240, Loss: 5.0236, Loss AR: 4.4817, Loss CFM: 0.5418, Grad Norm: 3.9962, LR: 0.000019
Epoch 0, Iteration 16250, Loss: 4.9873, Loss AR: 4.3941, Loss CFM: 0.5932, Grad Norm: 4.1080, LR: 0.000019
Epoch 0, Iteration 16260, Loss: 5.2370, Loss AR: 4.6393, Loss CFM: 0.5977, Grad Norm: 5.2438, LR: 0.000019
Epoch 0, Iteration 16270, Loss: 4.7267, Loss AR: 4.1953, Loss CFM: 0.5314, Grad Norm: 3.3165, LR: 0.000019
Epoch 0, Iteration 16280, Loss: 5.0194, Loss AR: 4.4557, Loss CFM: 0.5637, Grad Norm: 4.4272, LR: 0.000019
Epoch 0, Iteration 16290, Loss: 5.1169, Loss AR: 4.5553, Loss CFM: 0.5616, Grad Norm: 3.8833, LR: 0.000019
Epoch 0, Iteration 16300, Loss: 4.8804, Loss AR: 4.4485, Loss CFM: 0.4320, Grad Norm: 7.0534, LR: 0.000019
Epoch 0, Iteration 16310, Loss: 4.9452, Loss AR: 4.3575, Loss CFM: 0.5877, Grad Norm: 2.6378, LR: 0.000019
Epoch 0, Iteration 16320, Loss: 5.2056, Loss AR: 4.7048, Loss CFM: 0.5008, Grad Norm: 3.0965, LR: 0.000019
Epoch 0, Iteration 16330, Loss: 5.1494, Loss AR: 4.6080, Loss CFM: 0.5413, Grad Norm: 3.3045, LR: 0.000019
Epoch 0, Iteration 16340, Loss: 5.1558, Loss AR: 4.6119, Loss CFM: 0.5440, Grad Norm: 2.8536, LR: 0.000019
Epoch 0, Iteration 16350, Loss: 4.9149, Loss AR: 4.3853, Loss CFM: 0.5295, Grad Norm: 4.1981, LR: 0.000019
Epoch 0, Iteration 16360, Loss: 4.9141, Loss AR: 4.3518, Loss CFM: 0.5623, Grad Norm: 4.0872, LR: 0.000019
Epoch 0, Iteration 16370, Loss: 4.9586, Loss AR: 4.3835, Loss CFM: 0.5751, Grad Norm: 3.3440, LR: 0.000019
Epoch 0, Iteration 16380, Loss: 5.2096, Loss AR: 4.6938, Loss CFM: 0.5158, Grad Norm: 5.2441, LR: 0.000019
Epoch 0, Iteration 16390, Loss: 5.4412, Loss AR: 4.9479, Loss CFM: 0.4933, Grad Norm: 4.6315, LR: 0.000019
Epoch 0, Iteration 16400, Loss: 4.9258, Loss AR: 4.3001, Loss CFM: 0.6257, Grad Norm: 5.3707, LR: 0.000019
Epoch 0, Iteration 16410, Loss: 5.0990, Loss AR: 4.5609, Loss CFM: 0.5381, Grad Norm: 12.6371, LR: 0.000019
Epoch 0, Iteration 16420, Loss: 4.8085, Loss AR: 4.2447, Loss CFM: 0.5638, Grad Norm: 4.4407, LR: 0.000019
Epoch 0, Iteration 16430, Loss: 5.1531, Loss AR: 4.5840, Loss CFM: 0.5692, Grad Norm: 4.5940, LR: 0.000019
Epoch 0, Iteration 16440, Loss: 5.1486, Loss AR: 4.6456, Loss CFM: 0.5029, Grad Norm: 4.6641, LR: 0.000019
Epoch 0, Iteration 16450, Loss: 5.2730, Loss AR: 4.7624, Loss CFM: 0.5106, Grad Norm: 3.1435, LR: 0.000019
Epoch 0, Iteration 16460, Loss: 4.8517, Loss AR: 4.2922, Loss CFM: 0.5595, Grad Norm: 4.9906, LR: 0.000019
Epoch 0, Iteration 16470, Loss: 5.3130, Loss AR: 4.7095, Loss CFM: 0.6034, Grad Norm: 4.8672, LR: 0.000019
Epoch 0, Iteration 16480, Loss: 5.0965, Loss AR: 4.6203, Loss CFM: 0.4762, Grad Norm: 4.6001, LR: 0.000019
Epoch 0, Iteration 16490, Loss: 4.9699, Loss AR: 4.4282, Loss CFM: 0.5417, Grad Norm: 4.1996, LR: 0.000019
Epoch 0, Iteration 16500, Loss: 4.9296, Loss AR: 4.3851, Loss CFM: 0.5445, Grad Norm: 4.1151, LR: 0.000019
Epoch 0, Iteration 16510, Loss: 4.9506, Loss AR: 4.4021, Loss CFM: 0.5484, Grad Norm: 3.7493, LR: 0.000019
Epoch 0, Iteration 16520, Loss: 5.0105, Loss AR: 4.4458, Loss CFM: 0.5647, Grad Norm: 4.7187, LR: 0.000019
Epoch 0, Iteration 16530, Loss: 5.1101, Loss AR: 4.5734, Loss CFM: 0.5367, Grad Norm: 4.4695, LR: 0.000019
Epoch 0, Iteration 16540, Loss: 4.8843, Loss AR: 4.3364, Loss CFM: 0.5479, Grad Norm: 2.7422, LR: 0.000019
Epoch 0, Iteration 16550, Loss: 5.2603, Loss AR: 4.5584, Loss CFM: 0.7019, Grad Norm: 3.4352, LR: 0.000019
Epoch 0, Iteration 16560, Loss: 5.1358, Loss AR: 4.5950, Loss CFM: 0.5407, Grad Norm: 3.5888, LR: 0.000019
Epoch 0, Iteration 16570, Loss: 5.0394, Loss AR: 4.5214, Loss CFM: 0.5180, Grad Norm: 4.1251, LR: 0.000019
Epoch 0, Iteration 16580, Loss: 5.2208, Loss AR: 4.6112, Loss CFM: 0.6097, Grad Norm: 4.9316, LR: 0.000019
Epoch 0, Iteration 16590, Loss: 5.1126, Loss AR: 4.5139, Loss CFM: 0.5987, Grad Norm: 4.2724, LR: 0.000019
Epoch 0, Iteration 16600, Loss: 5.0936, Loss AR: 4.5546, Loss CFM: 0.5390, Grad Norm: 5.0670, LR: 0.000019
Epoch 0, Iteration 16610, Loss: 5.1813, Loss AR: 4.6578, Loss CFM: 0.5235, Grad Norm: 4.3464, LR: 0.000019
Epoch 0, Iteration 16620, Loss: 5.0207, Loss AR: 4.4779, Loss CFM: 0.5428, Grad Norm: 3.4030, LR: 0.000019
Epoch 0, Iteration 16630, Loss: 5.1338, Loss AR: 4.5597, Loss CFM: 0.5741, Grad Norm: 5.0128, LR: 0.000019
Epoch 0, Iteration 16640, Loss: 5.0514, Loss AR: 4.4885, Loss CFM: 0.5628, Grad Norm: 3.3430, LR: 0.000019
Epoch 0, Iteration 16650, Loss: 4.8632, Loss AR: 4.3447, Loss CFM: 0.5185, Grad Norm: 3.9336, LR: 0.000019
Epoch 0, Iteration 16660, Loss: 5.0787, Loss AR: 4.5175, Loss CFM: 0.5613, Grad Norm: 3.1030, LR: 0.000019
Epoch 0, Iteration 16670, Loss: 5.1139, Loss AR: 4.5889, Loss CFM: 0.5250, Grad Norm: 4.7868, LR: 0.000019
Epoch 0, Iteration 16680, Loss: 5.3603, Loss AR: 4.7575, Loss CFM: 0.6028, Grad Norm: 4.3509, LR: 0.000019
Epoch 0, Iteration 16690, Loss: 5.3701, Loss AR: 4.7440, Loss CFM: 0.6262, Grad Norm: 5.1348, LR: 0.000019
Epoch 0, Iteration 16700, Loss: 4.9518, Loss AR: 4.3697, Loss CFM: 0.5820, Grad Norm: 4.1729, LR: 0.000019
Epoch 0, Iteration 16710, Loss: 5.3359, Loss AR: 4.7712, Loss CFM: 0.5648, Grad Norm: 2.9560, LR: 0.000019
Epoch 0, Iteration 16720, Loss: 5.1293, Loss AR: 4.6145, Loss CFM: 0.5148, Grad Norm: 4.2436, LR: 0.000019
Epoch 0, Iteration 16730, Loss: 4.8934, Loss AR: 4.3694, Loss CFM: 0.5240, Grad Norm: 4.4071, LR: 0.000019
Epoch 0, Iteration 16740, Loss: 5.3151, Loss AR: 4.6960, Loss CFM: 0.6190, Grad Norm: 4.7239, LR: 0.000019
Epoch 0, Iteration 16750, Loss: 4.9045, Loss AR: 4.4652, Loss CFM: 0.4393, Grad Norm: 3.4756, LR: 0.000019
Epoch 0, Iteration 16760, Loss: 5.3440, Loss AR: 4.8273, Loss CFM: 0.5168, Grad Norm: 3.9407, LR: 0.000019
Epoch 0, Iteration 16770, Loss: 5.1404, Loss AR: 4.6047, Loss CFM: 0.5356, Grad Norm: 6.4449, LR: 0.000019
Epoch 0, Iteration 16780, Loss: 5.0659, Loss AR: 4.4127, Loss CFM: 0.6532, Grad Norm: 3.5133, LR: 0.000019
Epoch 0, Iteration 16790, Loss: 5.1784, Loss AR: 4.5915, Loss CFM: 0.5869, Grad Norm: 3.8536, LR: 0.000019
Epoch 0, Iteration 16800, Loss: 5.0248, Loss AR: 4.4688, Loss CFM: 0.5559, Grad Norm: 2.9375, LR: 0.000019
Epoch 0, Iteration 16810, Loss: 5.0933, Loss AR: 4.4786, Loss CFM: 0.6147, Grad Norm: 3.5358, LR: 0.000019
Epoch 0, Iteration 16820, Loss: 5.0480, Loss AR: 4.5546, Loss CFM: 0.4934, Grad Norm: 3.7649, LR: 0.000019
Epoch 0, Iteration 16830, Loss: 4.8699, Loss AR: 4.3631, Loss CFM: 0.5068, Grad Norm: 4.3321, LR: 0.000019
Epoch 0, Iteration 16840, Loss: 5.1585, Loss AR: 4.6265, Loss CFM: 0.5320, Grad Norm: 3.8948, LR: 0.000019
Epoch 0, Iteration 16850, Loss: 5.2019, Loss AR: 4.6468, Loss CFM: 0.5551, Grad Norm: 6.2975, LR: 0.000019
Epoch 0, Iteration 16860, Loss: 5.2422, Loss AR: 4.6724, Loss CFM: 0.5698, Grad Norm: 3.9706, LR: 0.000019
Epoch 0, Iteration 16870, Loss: 5.2294, Loss AR: 4.6196, Loss CFM: 0.6098, Grad Norm: 4.4756, LR: 0.000019
Epoch 0, Iteration 16880, Loss: 5.0684, Loss AR: 4.4845, Loss CFM: 0.5839, Grad Norm: 3.1383, LR: 0.000019
Epoch 0, Iteration 16890, Loss: 4.9227, Loss AR: 4.3636, Loss CFM: 0.5591, Grad Norm: 3.3142, LR: 0.000019
Epoch 0, Iteration 16900, Loss: 4.9770, Loss AR: 4.4086, Loss CFM: 0.5684, Grad Norm: 4.2232, LR: 0.000019
Epoch 0, Iteration 16910, Loss: 5.1055, Loss AR: 4.5769, Loss CFM: 0.5285, Grad Norm: 3.4814, LR: 0.000019
Epoch 0, Iteration 16920, Loss: 5.0829, Loss AR: 4.5401, Loss CFM: 0.5429, Grad Norm: 4.2754, LR: 0.000019
Epoch 0, Iteration 16930, Loss: 4.9589, Loss AR: 4.4559, Loss CFM: 0.5030, Grad Norm: 4.6263, LR: 0.000019
Epoch 0, Iteration 16940, Loss: 5.1258, Loss AR: 4.5013, Loss CFM: 0.6244, Grad Norm: 4.0650, LR: 0.000019
Epoch 0, Iteration 16950, Loss: 4.9616, Loss AR: 4.3510, Loss CFM: 0.6107, Grad Norm: 3.5115, LR: 0.000019
Epoch 0, Iteration 16960, Loss: 5.0009, Loss AR: 4.4733, Loss CFM: 0.5277, Grad Norm: 3.7213, LR: 0.000019
Epoch 0, Iteration 16970, Loss: 5.2431, Loss AR: 4.7503, Loss CFM: 0.4928, Grad Norm: 5.8027, LR: 0.000019
Epoch 0, Iteration 16980, Loss: 5.2099, Loss AR: 4.6211, Loss CFM: 0.5888, Grad Norm: 4.7248, LR: 0.000019
Epoch 0, Iteration 16990, Loss: 5.1110, Loss AR: 4.4728, Loss CFM: 0.6382, Grad Norm: 4.2773, LR: 0.000019
Epoch 0, Iteration 17000, Loss: 4.8662, Loss AR: 4.3269, Loss CFM: 0.5392, Grad Norm: 3.6796, LR: 0.000019
Epoch 0, Iteration 17010, Loss: 4.9645, Loss AR: 4.4170, Loss CFM: 0.5475, Grad Norm: 3.9397, LR: 0.000019
Epoch 0, Iteration 17020, Loss: 5.0406, Loss AR: 4.4376, Loss CFM: 0.6031, Grad Norm: 3.4997, LR: 0.000019
Epoch 0, Iteration 17030, Loss: 4.9620, Loss AR: 4.4186, Loss CFM: 0.5434, Grad Norm: 3.3881, LR: 0.000019
Epoch 0, Iteration 17040, Loss: 5.3420, Loss AR: 4.5487, Loss CFM: 0.7933, Grad Norm: 3.9592, LR: 0.000019
Epoch 0, Iteration 17050, Loss: 5.0897, Loss AR: 4.4697, Loss CFM: 0.6200, Grad Norm: 4.0845, LR: 0.000019
Epoch 0, Iteration 17060, Loss: 5.0078, Loss AR: 4.4911, Loss CFM: 0.5168, Grad Norm: 4.3814, LR: 0.000019
Epoch 0, Iteration 17070, Loss: 5.0475, Loss AR: 4.4821, Loss CFM: 0.5653, Grad Norm: 3.6047, LR: 0.000019
Epoch 0, Iteration 17080, Loss: 5.1488, Loss AR: 4.5683, Loss CFM: 0.5804, Grad Norm: 4.2440, LR: 0.000019
Epoch 0, Iteration 17090, Loss: 5.1374, Loss AR: 4.5453, Loss CFM: 0.5921, Grad Norm: 5.5194, LR: 0.000019
Epoch 0, Iteration 17100, Loss: 5.1248, Loss AR: 4.6499, Loss CFM: 0.4749, Grad Norm: 4.0571, LR: 0.000019
Epoch 0, Iteration 17110, Loss: 5.2186, Loss AR: 4.6846, Loss CFM: 0.5341, Grad Norm: 4.7940, LR: 0.000019
Epoch 0, Iteration 17120, Loss: 5.2333, Loss AR: 4.6487, Loss CFM: 0.5846, Grad Norm: 4.8780, LR: 0.000019
Epoch 0, Iteration 17130, Loss: 5.0279, Loss AR: 4.4428, Loss CFM: 0.5851, Grad Norm: 3.9288, LR: 0.000019
Epoch 0, Iteration 17140, Loss: 5.1218, Loss AR: 4.5932, Loss CFM: 0.5286, Grad Norm: 4.6982, LR: 0.000019
Epoch 0, Iteration 17150, Loss: 5.1670, Loss AR: 4.6694, Loss CFM: 0.4976, Grad Norm: 2.9231, LR: 0.000019
Epoch 0, Iteration 17160, Loss: 4.8567, Loss AR: 4.3433, Loss CFM: 0.5134, Grad Norm: 3.2460, LR: 0.000019
Epoch 0, Iteration 17170, Loss: 5.1046, Loss AR: 4.5833, Loss CFM: 0.5213, Grad Norm: 4.0430, LR: 0.000019
Epoch 0, Iteration 17180, Loss: 5.7749, Loss AR: 5.2258, Loss CFM: 0.5491, Grad Norm: 4.7535, LR: 0.000019
Epoch 0, Iteration 17190, Loss: 5.0396, Loss AR: 4.4859, Loss CFM: 0.5538, Grad Norm: 3.8670, LR: 0.000019
Epoch 0, Iteration 17200, Loss: 5.0745, Loss AR: 4.5115, Loss CFM: 0.5630, Grad Norm: 3.8809, LR: 0.000019
Epoch 0, Iteration 17210, Loss: 4.8547, Loss AR: 4.3008, Loss CFM: 0.5539, Grad Norm: 3.2091, LR: 0.000019
Epoch 0, Iteration 17220, Loss: 4.9253, Loss AR: 4.3669, Loss CFM: 0.5583, Grad Norm: 3.5272, LR: 0.000019
Epoch 0, Iteration 17230, Loss: 5.4903, Loss AR: 4.9215, Loss CFM: 0.5688, Grad Norm: 3.8476, LR: 0.000019
Epoch 0, Iteration 17240, Loss: 5.0417, Loss AR: 4.4647, Loss CFM: 0.5769, Grad Norm: 3.9583, LR: 0.000019
Epoch 0, Iteration 17250, Loss: 5.0380, Loss AR: 4.4640, Loss CFM: 0.5741, Grad Norm: 4.2660, LR: 0.000019
Epoch 0, Iteration 17260, Loss: 5.0762, Loss AR: 4.5251, Loss CFM: 0.5511, Grad Norm: 3.7894, LR: 0.000019
Epoch 0, Iteration 17270, Loss: 4.9641, Loss AR: 4.4383, Loss CFM: 0.5258, Grad Norm: 3.7038, LR: 0.000019
Epoch 0, Iteration 17280, Loss: 5.0643, Loss AR: 4.5218, Loss CFM: 0.5425, Grad Norm: 4.1938, LR: 0.000019
Epoch 0, Iteration 17290, Loss: 5.0019, Loss AR: 4.4478, Loss CFM: 0.5542, Grad Norm: 4.0777, LR: 0.000019
Epoch 0, Iteration 17300, Loss: 5.3076, Loss AR: 4.7033, Loss CFM: 0.6043, Grad Norm: 3.7056, LR: 0.000019
Epoch 0, Iteration 17310, Loss: 4.9657, Loss AR: 4.4104, Loss CFM: 0.5553, Grad Norm: 3.0296, LR: 0.000019
Epoch 0, Iteration 17320, Loss: 5.2822, Loss AR: 4.7254, Loss CFM: 0.5568, Grad Norm: 3.9292, LR: 0.000019
Epoch 0, Iteration 17330, Loss: 4.9941, Loss AR: 4.4213, Loss CFM: 0.5728, Grad Norm: 4.7785, LR: 0.000019
Epoch 0, Iteration 17340, Loss: 4.9212, Loss AR: 4.3646, Loss CFM: 0.5566, Grad Norm: 4.2151, LR: 0.000019
Epoch 0, Iteration 17350, Loss: 5.1111, Loss AR: 4.5402, Loss CFM: 0.5709, Grad Norm: 3.1583, LR: 0.000019
Epoch 0, Iteration 17360, Loss: 5.0203, Loss AR: 4.4753, Loss CFM: 0.5450, Grad Norm: 4.6734, LR: 0.000019
Epoch 0, Iteration 17370, Loss: 4.9631, Loss AR: 4.4807, Loss CFM: 0.4824, Grad Norm: 4.1201, LR: 0.000019
Epoch 0, Iteration 17380, Loss: 4.9592, Loss AR: 4.3928, Loss CFM: 0.5664, Grad Norm: 4.0650, LR: 0.000019
Epoch 0, Iteration 17390, Loss: 5.0005, Loss AR: 4.4566, Loss CFM: 0.5438, Grad Norm: 3.7993, LR: 0.000019
Epoch 0, Iteration 17400, Loss: 5.1344, Loss AR: 4.5943, Loss CFM: 0.5402, Grad Norm: 4.2788, LR: 0.000019
Epoch 0, Iteration 17410, Loss: 4.8890, Loss AR: 4.3177, Loss CFM: 0.5713, Grad Norm: 4.4898, LR: 0.000019
Epoch 0, Iteration 17420, Loss: 5.1085, Loss AR: 4.5809, Loss CFM: 0.5276, Grad Norm: 3.6258, LR: 0.000019
Epoch 0, Iteration 17430, Loss: 5.0292, Loss AR: 4.4801, Loss CFM: 0.5491, Grad Norm: 3.8810, LR: 0.000019
Epoch 0, Iteration 17440, Loss: 5.2917, Loss AR: 4.6657, Loss CFM: 0.6261, Grad Norm: 5.6506, LR: 0.000019
Epoch 0, Iteration 17450, Loss: 5.1879, Loss AR: 4.6404, Loss CFM: 0.5475, Grad Norm: 4.3522, LR: 0.000019
Epoch 0, Iteration 17460, Loss: 5.1284, Loss AR: 4.5799, Loss CFM: 0.5484, Grad Norm: 4.4681, LR: 0.000019
Epoch 0, Iteration 17470, Loss: 5.0434, Loss AR: 4.5219, Loss CFM: 0.5215, Grad Norm: 3.3353, LR: 0.000019
Epoch 0, Iteration 17480, Loss: 5.1682, Loss AR: 4.6073, Loss CFM: 0.5609, Grad Norm: 3.4753, LR: 0.000019
Epoch 0, Iteration 17490, Loss: 5.0771, Loss AR: 4.5649, Loss CFM: 0.5122, Grad Norm: 3.3265, LR: 0.000019
Epoch 0, Iteration 17500, Loss: 4.8066, Loss AR: 4.3118, Loss CFM: 0.4948, Grad Norm: 4.4078, LR: 0.000019
Epoch 0, Iteration 17510, Loss: 5.0682, Loss AR: 4.5074, Loss CFM: 0.5608, Grad Norm: 3.0037, LR: 0.000019
Epoch 0, Iteration 17520, Loss: 5.0631, Loss AR: 4.5622, Loss CFM: 0.5009, Grad Norm: 3.9460, LR: 0.000019
Epoch 0, Iteration 17530, Loss: 5.0030, Loss AR: 4.5203, Loss CFM: 0.4827, Grad Norm: 5.1968, LR: 0.000019
Epoch 0, Iteration 17540, Loss: 5.1437, Loss AR: 4.5735, Loss CFM: 0.5702, Grad Norm: 4.1178, LR: 0.000019
Epoch 0, Iteration 17550, Loss: 5.0767, Loss AR: 4.5702, Loss CFM: 0.5065, Grad Norm: 5.0893, LR: 0.000019
Epoch 0, Iteration 17560, Loss: 4.9071, Loss AR: 4.3864, Loss CFM: 0.5206, Grad Norm: 4.0398, LR: 0.000019
Epoch 0, Iteration 17570, Loss: 4.9069, Loss AR: 4.3902, Loss CFM: 0.5166, Grad Norm: 5.2248, LR: 0.000019
Epoch 0, Iteration 17580, Loss: 5.0353, Loss AR: 4.4270, Loss CFM: 0.6083, Grad Norm: 3.9286, LR: 0.000019
Epoch 0, Iteration 17590, Loss: 5.3732, Loss AR: 4.7842, Loss CFM: 0.5890, Grad Norm: 6.2227, LR: 0.000019
Epoch 0, Iteration 17600, Loss: 4.9155, Loss AR: 4.4152, Loss CFM: 0.5004, Grad Norm: 3.8358, LR: 0.000019
Epoch 0, Iteration 17610, Loss: 5.1475, Loss AR: 4.5711, Loss CFM: 0.5763, Grad Norm: 5.5550, LR: 0.000019
Epoch 0, Iteration 17620, Loss: 4.8984, Loss AR: 4.2726, Loss CFM: 0.6259, Grad Norm: 4.1643, LR: 0.000019
Epoch 0, Iteration 17630, Loss: 5.0860, Loss AR: 4.5917, Loss CFM: 0.4943, Grad Norm: 4.4554, LR: 0.000019
Epoch 0, Iteration 17640, Loss: 5.1493, Loss AR: 4.6411, Loss CFM: 0.5082, Grad Norm: 4.8703, LR: 0.000019
Epoch 0, Iteration 17650, Loss: 5.2286, Loss AR: 4.6742, Loss CFM: 0.5544, Grad Norm: 4.1157, LR: 0.000019
Epoch 0, Iteration 17660, Loss: 5.0800, Loss AR: 4.5307, Loss CFM: 0.5492, Grad Norm: 4.0226, LR: 0.000019
Epoch 0, Iteration 17670, Loss: 4.9295, Loss AR: 4.3827, Loss CFM: 0.5467, Grad Norm: 3.4845, LR: 0.000019
Epoch 0, Iteration 17680, Loss: 5.0965, Loss AR: 4.5037, Loss CFM: 0.5928, Grad Norm: 3.2388, LR: 0.000019
Epoch 0, Iteration 17690, Loss: 5.2950, Loss AR: 4.7938, Loss CFM: 0.5012, Grad Norm: 5.0492, LR: 0.000019