{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:49:23.131069', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 107.54594258685516, 'timestamp': '2025-09-05 08:49:23.133261', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:23.415457', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.6169126629829407, 'timestamp': '2025-09-05 08:49:23.417206', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:23.625279', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.7517166137695312, 'timestamp': '2025-09-05 08:49:23.627202', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:23.802422', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.6235635280609131, 'timestamp': '2025-09-05 08:49:23.804678', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:23.985359', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.8837859630584717, 'timestamp': '2025-09-05 08:49:24.194974', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:24.346141', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.643877387046814, 'timestamp': '2025-09-05 08:49:24.348074', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:24.522959', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.7255116105079651, 'timestamp': '2025-09-05 08:49:24.524779', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:24.695679', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.5929327607154846, 'timestamp': '2025-09-05 08:49:24.697536', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:24.876445', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.6998869776725769, 'timestamp': '2025-09-05 08:49:24.892849', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:25.062476', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.7311207056045532, 'timestamp': '2025-09-05 08:49:25.064482', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:25.235009', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.6596672534942627, 'timestamp': '2025-09-05 08:49:25.238640', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:25.414055', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.8627897500991821, 'timestamp': '2025-09-05 08:49:25.416048', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:25.585764', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.7733902335166931, 'timestamp': '2025-09-05 08:49:25.602455', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:25.771572', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.6929965615272522, 'timestamp': '2025-09-05 08:49:25.773466', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:25.950489', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.622175395488739, 'timestamp': '2025-09-05 08:49:25.952905', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:26.125306', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.5498591065406799, 'timestamp': '2025-09-05 08:49:26.127215', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:26.296529', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.5476340651512146, 'timestamp': '2025-09-05 08:49:26.312315', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:26.474267', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.592532217502594, 'timestamp': '2025-09-05 08:49:26.476179', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:26.645322', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.624656617641449, 'timestamp': '2025-09-05 08:49:26.647578', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:26.819386', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.6042789816856384, 'timestamp': '2025-09-05 08:49:26.821212', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:26.992800', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.54128497838974, 'timestamp': '2025-09-05 08:49:27.006809', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:49:31.633614', 'step': 20, 'epoch': 1} {'type': 'pplx', 'content': 98.93025738022824, 'timestamp': '2025-09-05 08:49:31.636504', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:31.771651', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.6219188570976257, 'timestamp': '2025-09-05 08:49:31.773972', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:31.945046', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.6864243745803833, 'timestamp': '2025-09-05 08:49:31.947077', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:32.119341', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.46079257130622864, 'timestamp': '2025-09-05 08:49:32.121160', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:32.292480', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.6522855162620544, 'timestamp': '2025-09-05 08:49:32.307856', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:32.472758', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.4554615318775177, 'timestamp': '2025-09-05 08:49:32.474764', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:32.643264', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.530107319355011, 'timestamp': '2025-09-05 08:49:32.645124', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:32.823718', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.5458198189735413, 'timestamp': '2025-09-05 08:49:32.825511', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:32.995958', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.6185097098350525, 'timestamp': '2025-09-05 08:49:33.012305', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:33.181153', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.48863735795021057, 'timestamp': '2025-09-05 08:49:33.183739', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:33.360817', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.4185200035572052, 'timestamp': '2025-09-05 08:49:33.362640', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:33.532666', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.5862270593643188, 'timestamp': '2025-09-05 08:49:33.534345', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:33.709825', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.5197166800498962, 'timestamp': '2025-09-05 08:49:33.724694', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:33.893918', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.4848283529281616, 'timestamp': '2025-09-05 08:49:33.897285', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:34.070516', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.48005831241607666, 'timestamp': '2025-09-05 08:49:34.072556', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:34.244353', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.5067130923271179, 'timestamp': '2025-09-05 08:49:34.246260', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:34.423602', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.5473061800003052, 'timestamp': '2025-09-05 08:49:34.438717', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:34.608695', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.5381602048873901, 'timestamp': '2025-09-05 08:49:34.610450', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:34.781445', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.53970867395401, 'timestamp': '2025-09-05 08:49:34.783221', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:34.961725', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.5283556580543518, 'timestamp': '2025-09-05 08:49:34.963772', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:35.140557', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.4740239679813385, 'timestamp': '2025-09-05 08:49:35.154682', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:49:39.785388', 'step': 40, 'epoch': 1} {'type': 'pplx', 'content': 87.66840919832546, 'timestamp': '2025-09-05 08:49:39.787493', 'step': 40, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 40', 'timestamp': '2025-09-05 08:49:40.264753', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:40.437738', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.48637235164642334, 'timestamp': '2025-09-05 08:49:40.439854', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:40.645274', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.5245602130889893, 'timestamp': '2025-09-05 08:49:40.647013', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:40.852808', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.4956504702568054, 'timestamp': '2025-09-05 08:49:40.854709', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:41.052545', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.49369287490844727, 'timestamp': '2025-09-05 08:49:41.069275', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:41.266843', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.5377260446548462, 'timestamp': '2025-09-05 08:49:41.268933', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:41.467042', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.3377070426940918, 'timestamp': '2025-09-05 08:49:41.468889', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:41.666652', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.49804773926734924, 'timestamp': '2025-09-05 08:49:41.668696', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:41.866243', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.5739774703979492, 'timestamp': '2025-09-05 08:49:41.881595', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:42.080545', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.44619235396385193, 'timestamp': '2025-09-05 08:49:42.082562', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:42.287133', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.5007292032241821, 'timestamp': '2025-09-05 08:49:42.289340', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:42.490217', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.47838714718818665, 'timestamp': '2025-09-05 08:49:42.492212', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:42.699436', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.5947479605674744, 'timestamp': '2025-09-05 08:49:42.714002', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:42.904233', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.4078122675418854, 'timestamp': '2025-09-05 08:49:42.906345', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:43.103998', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.3910931646823883, 'timestamp': '2025-09-05 08:49:43.106013', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:43.306892', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.5116702318191528, 'timestamp': '2025-09-05 08:49:43.308847', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:43.506454', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.40538397431373596, 'timestamp': '2025-09-05 08:49:43.523075', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:43.720676', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.3850765526294708, 'timestamp': '2025-09-05 08:49:43.722821', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:43.919447', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.5028924345970154, 'timestamp': '2025-09-05 08:49:43.922267', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:44.120144', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.5674346089363098, 'timestamp': '2025-09-05 08:49:44.122315', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:44.318793', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.4544109106063843, 'timestamp': '2025-09-05 08:49:44.332801', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:49:48.947455', 'step': 60, 'epoch': 1} {'type': 'pplx', 'content': 80.36248908828922, 'timestamp': '2025-09-05 08:49:48.949747', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:49.112997', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.4171891510486603, 'timestamp': '2025-09-05 08:49:49.115100', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:49.283044', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.32811057567596436, 'timestamp': '2025-09-05 08:49:49.285337', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:49.491105', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.5731877088546753, 'timestamp': '2025-09-05 08:49:49.493306', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:49.692196', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.49480125308036804, 'timestamp': '2025-09-05 08:49:49.708875', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:49.906046', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.5468859076499939, 'timestamp': '2025-09-05 08:49:49.908063', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:50.105533', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.40803736448287964, 'timestamp': '2025-09-05 08:49:50.107576', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:50.303363', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.5299073457717896, 'timestamp': '2025-09-05 08:49:50.306069', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:50.506081', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.42571789026260376, 'timestamp': '2025-09-05 08:49:50.526032', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:50.716930', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.4201086461544037, 'timestamp': '2025-09-05 08:49:50.718937', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:50.916075', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.4884677231311798, 'timestamp': '2025-09-05 08:49:50.918031', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:51.116971', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.41253289580345154, 'timestamp': '2025-09-05 08:49:51.118932', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:51.326776', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.45812252163887024, 'timestamp': '2025-09-05 08:49:51.341164', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:51.538100', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.5628377795219421, 'timestamp': '2025-09-05 08:49:51.540135', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:49:51.742012', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.5147489905357361, 'timestamp': '2025-09-05 08:49:51.743976', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:51.950073', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.36594030261039734, 'timestamp': '2025-09-05 08:49:51.952199', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:52.149922', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.4461878538131714, 'timestamp': '2025-09-05 08:49:52.164170', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:52.353769', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.4893703758716583, 'timestamp': '2025-09-05 08:49:52.355812', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:52.552180', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.4122447967529297, 'timestamp': '2025-09-05 08:49:52.554284', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:52.762974', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.37286263704299927, 'timestamp': '2025-09-05 08:49:52.764953', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:52.959916', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.32313352823257446, 'timestamp': '2025-09-05 08:49:52.974096', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:49:57.618632', 'step': 80, 'epoch': 1} {'type': 'pplx', 'content': 75.94659559691486, 'timestamp': '2025-09-05 08:49:57.620939', 'step': 80, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 80', 'timestamp': '2025-09-05 08:49:58.082937', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:58.253682', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.4822365939617157, 'timestamp': '2025-09-05 08:49:58.255959', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:58.453126', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.41527748107910156, 'timestamp': '2025-09-05 08:49:58.455406', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:49:58.651029', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.49072203040122986, 'timestamp': '2025-09-05 08:49:58.652972', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:49:58.852717', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.48262521624565125, 'timestamp': '2025-09-05 08:49:58.867136', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:49:59.058672', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.4185166656970978, 'timestamp': '2025-09-05 08:49:59.060765', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:59.257019', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.37654638290405273, 'timestamp': '2025-09-05 08:49:59.259103', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:49:59.468228', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.4784839451313019, 'timestamp': '2025-09-05 08:49:59.470022', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:49:59.668389', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.4580772817134857, 'timestamp': '2025-09-05 08:49:59.684569', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:49:59.884885', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.4850596487522125, 'timestamp': '2025-09-05 08:49:59.887173', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:00.083921', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.4436725974082947, 'timestamp': '2025-09-05 08:50:00.085702', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:00.292103', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.3864012360572815, 'timestamp': '2025-09-05 08:50:00.294189', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:00.493198', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.41005319356918335, 'timestamp': '2025-09-05 08:50:00.507541', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:00.697222', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.29754865169525146, 'timestamp': '2025-09-05 08:50:00.699018', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:00.895786', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.5100644826889038, 'timestamp': '2025-09-05 08:50:00.897573', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:50:01.104437', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.5071147680282593, 'timestamp': '2025-09-05 08:50:01.106343', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:01.304995', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.5573170781135559, 'timestamp': '2025-09-05 08:50:01.319332', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:01.517000', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.5759362578392029, 'timestamp': '2025-09-05 08:50:01.518928', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:01.718213', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.44677627086639404, 'timestamp': '2025-09-05 08:50:01.720227', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:01.927026', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.5631275177001953, 'timestamp': '2025-09-05 08:50:01.928989', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:02.126438', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.39676016569137573, 'timestamp': '2025-09-05 08:50:02.141475', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:50:06.771451', 'step': 100, 'epoch': 1} {'type': 'pplx', 'content': 73.19032358278096, 'timestamp': '2025-09-05 08:50:06.773213', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:06.935894', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.34731829166412354, 'timestamp': '2025-09-05 08:50:06.937859', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:50:07.147849', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.36952462792396545, 'timestamp': '2025-09-05 08:50:07.149839', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:07.355748', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.4270903468132019, 'timestamp': '2025-09-05 08:50:07.357775', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:07.555093', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.3678099513053894, 'timestamp': '2025-09-05 08:50:07.569703', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:07.766139', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.40596431493759155, 'timestamp': '2025-09-05 08:50:07.768238', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:07.965392', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.44920292496681213, 'timestamp': '2025-09-05 08:50:07.967648', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:08.174690', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.27072614431381226, 'timestamp': '2025-09-05 08:50:08.176704', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:08.374070', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.44459718465805054, 'timestamp': '2025-09-05 08:50:08.388419', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:50:08.578453', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.3547823131084442, 'timestamp': '2025-09-05 08:50:08.580293', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:08.777417', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.32400763034820557, 'timestamp': '2025-09-05 08:50:08.779441', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:08.985458', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.3592005968093872, 'timestamp': '2025-09-05 08:50:08.987409', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:09.193950', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.6158202886581421, 'timestamp': '2025-09-05 08:50:09.208214', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:09.399264', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.36427071690559387, 'timestamp': '2025-09-05 08:50:09.401514', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:09.610077', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.28574255108833313, 'timestamp': '2025-09-05 08:50:09.611869', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:09.810026', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.33345794677734375, 'timestamp': '2025-09-05 08:50:09.812004', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:10.009667', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.368472695350647, 'timestamp': '2025-09-05 08:50:10.026099', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:10.224608', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.3890371322631836, 'timestamp': '2025-09-05 08:50:10.226530', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:10.433837', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.44138211011886597, 'timestamp': '2025-09-05 08:50:10.435730', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:10.652742', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.3401064872741699, 'timestamp': '2025-09-05 08:50:10.654683', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:10.860449', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.35354849696159363, 'timestamp': '2025-09-05 08:50:10.876223', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:50:15.516995', 'step': 120, 'epoch': 1} {'type': 'pplx', 'content': 71.52772055113894, 'timestamp': '2025-09-05 08:50:15.519261', 'step': 120, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 120', 'timestamp': '2025-09-05 08:50:15.977316', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:16.152308', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.4743736684322357, 'timestamp': '2025-09-05 08:50:16.155092', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:16.352168', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.36207839846611023, 'timestamp': '2025-09-05 08:50:16.354167', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:16.554847', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.4461155831813812, 'timestamp': '2025-09-05 08:50:16.556726', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:16.765987', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.3289283215999603, 'timestamp': '2025-09-05 08:50:16.780395', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:16.978534', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.39339011907577515, 'timestamp': '2025-09-05 08:50:16.980573', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:17.179671', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.3922661244869232, 'timestamp': '2025-09-05 08:50:17.181546', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:17.379811', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.48367026448249817, 'timestamp': '2025-09-05 08:50:17.381618', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:17.579726', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.3444158434867859, 'timestamp': '2025-09-05 08:50:17.596345', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:17.793894', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.4369712471961975, 'timestamp': '2025-09-05 08:50:17.795934', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:17.993686', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.331216037273407, 'timestamp': '2025-09-05 08:50:17.995933', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:18.193092', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.3580273389816284, 'timestamp': '2025-09-05 08:50:18.195148', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:18.392606', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.31754398345947266, 'timestamp': '2025-09-05 08:50:18.407026', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:50:18.594763', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.5454202890396118, 'timestamp': '2025-09-05 08:50:18.596921', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:18.794716', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.43953007459640503, 'timestamp': '2025-09-05 08:50:18.796695', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:18.994641', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.31143492460250854, 'timestamp': '2025-09-05 08:50:18.996835', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:19.193535', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.48143064975738525, 'timestamp': '2025-09-05 08:50:19.210065', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:19.406796', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.3588726818561554, 'timestamp': '2025-09-05 08:50:19.408708', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:19.606770', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.38282686471939087, 'timestamp': '2025-09-05 08:50:19.609583', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:19.815782', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.42063143849372864, 'timestamp': '2025-09-05 08:50:19.818967', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:20.017464', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.4207852780818939, 'timestamp': '2025-09-05 08:50:20.031795', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:50:24.652104', 'step': 140, 'epoch': 1} {'type': 'pplx', 'content': 70.65361235350733, 'timestamp': '2025-09-05 08:50:24.654036', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:24.817137', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.34773746132850647, 'timestamp': '2025-09-05 08:50:24.819048', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:25.026073', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.4449472725391388, 'timestamp': '2025-09-05 08:50:25.027902', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:25.224932', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.4693523645401001, 'timestamp': '2025-09-05 08:50:25.231172', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:50:25.439007', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.3564371168613434, 'timestamp': '2025-09-05 08:50:25.453380', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:25.650784', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.37880274653434753, 'timestamp': '2025-09-05 08:50:25.652686', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:25.859559', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.31610339879989624, 'timestamp': '2025-09-05 08:50:25.862056', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:26.059662', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.46133285760879517, 'timestamp': '2025-09-05 08:50:26.061481', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:26.268593', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.413718044757843, 'timestamp': '2025-09-05 08:50:26.283153', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:26.469459', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.5549363493919373, 'timestamp': '2025-09-05 08:50:26.471379', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:26.677868', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.29777753353118896, 'timestamp': '2025-09-05 08:50:26.679559', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:26.875740', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.2828120291233063, 'timestamp': '2025-09-05 08:50:26.878140', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:27.073797', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.4168826639652252, 'timestamp': '2025-09-05 08:50:27.088398', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:27.277947', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.35562440752983093, 'timestamp': '2025-09-05 08:50:27.279668', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:27.475780', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.4816155433654785, 'timestamp': '2025-09-05 08:50:27.477761', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:27.675934', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.48008760809898376, 'timestamp': '2025-09-05 08:50:27.677685', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:27.883926', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.42271170020103455, 'timestamp': '2025-09-05 08:50:27.898295', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:28.088213', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.3952147960662842, 'timestamp': '2025-09-05 08:50:28.090116', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:28.286788', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.3233911097049713, 'timestamp': '2025-09-05 08:50:28.288735', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:28.496521', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.49929437041282654, 'timestamp': '2025-09-05 08:50:28.498313', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:28.704841', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.4313942492008209, 'timestamp': '2025-09-05 08:50:28.719313', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:50:33.321295', 'step': 160, 'epoch': 1} {'type': 'pplx', 'content': 70.65175573066632, 'timestamp': '2025-09-05 08:50:33.323369', 'step': 160, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 160', 'timestamp': '2025-09-05 08:50:33.770272', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:33.957466', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.34339627623558044, 'timestamp': '2025-09-05 08:50:33.959506', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:34.164500', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.45955607295036316, 'timestamp': '2025-09-05 08:50:34.166511', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:50:34.364395', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.4148689806461334, 'timestamp': '2025-09-05 08:50:34.366375', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:34.572456', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.34146106243133545, 'timestamp': '2025-09-05 08:50:34.588946', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:34.786564', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.3519960045814514, 'timestamp': '2025-09-05 08:50:34.788408', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:34.985928', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.2612743675708771, 'timestamp': '2025-09-05 08:50:34.988031', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:35.186266', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.37086915969848633, 'timestamp': '2025-09-05 08:50:35.188111', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:35.395045', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.3923819661140442, 'timestamp': '2025-09-05 08:50:35.411448', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:35.605525', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.5148409605026245, 'timestamp': '2025-09-05 08:50:35.607404', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:35.803412', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.34032538533210754, 'timestamp': '2025-09-05 08:50:35.805098', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:36.010702', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.5385991930961609, 'timestamp': '2025-09-05 08:50:36.012528', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:36.211564', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.3534490466117859, 'timestamp': '2025-09-05 08:50:36.225837', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:36.419293', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.3421189785003662, 'timestamp': '2025-09-05 08:50:36.421098', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:36.626841', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.2873148024082184, 'timestamp': '2025-09-05 08:50:36.628694', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:36.826986', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.28435009717941284, 'timestamp': '2025-09-05 08:50:36.828813', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:37.025748', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.3528280258178711, 'timestamp': '2025-09-05 08:50:37.039984', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:37.229084', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.4834965467453003, 'timestamp': '2025-09-05 08:50:37.230935', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:37.428456', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.33752936124801636, 'timestamp': '2025-09-05 08:50:37.430302', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:37.636783', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.44458168745040894, 'timestamp': '2025-09-05 08:50:37.638938', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:37.837233', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.40980052947998047, 'timestamp': '2025-09-05 08:50:37.853984', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:50:42.486191', 'step': 180, 'epoch': 1} {'type': 'pplx', 'content': 69.88217681460745, 'timestamp': '2025-09-05 08:50:42.488185', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:50:42.649403', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.3106338381767273, 'timestamp': '2025-09-05 08:50:42.653048', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:42.858542', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.33472582697868347, 'timestamp': '2025-09-05 08:50:42.861054', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:43.059529', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.5153182148933411, 'timestamp': '2025-09-05 08:50:43.061699', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:43.259054', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.3759649693965912, 'timestamp': '2025-09-05 08:50:43.275668', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:43.473055', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.25464075803756714, 'timestamp': '2025-09-05 08:50:43.475222', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:43.672348', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.46503615379333496, 'timestamp': '2025-09-05 08:50:43.674547', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:43.870814', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.36886751651763916, 'timestamp': '2025-09-05 08:50:43.872745', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:44.061475', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.31488415598869324, 'timestamp': '2025-09-05 08:50:44.076163', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:44.257798', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.41387924551963806, 'timestamp': '2025-09-05 08:50:44.259666', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:44.463752', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.3407166600227356, 'timestamp': '2025-09-05 08:50:44.465741', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:44.662388', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.3083406686782837, 'timestamp': '2025-09-05 08:50:44.666316', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:44.866525', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.46717512607574463, 'timestamp': '2025-09-05 08:50:44.882923', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:45.079418', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.37655341625213623, 'timestamp': '2025-09-05 08:50:45.082245', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:45.286188', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.2714085876941681, 'timestamp': '2025-09-05 08:50:45.289071', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:45.489108', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.44437673687934875, 'timestamp': '2025-09-05 08:50:45.491274', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:45.693314', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.525282621383667, 'timestamp': '2025-09-05 08:50:45.708374', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:45.900556', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.3641051650047302, 'timestamp': '2025-09-05 08:50:45.905337', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:46.107005', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.306657075881958, 'timestamp': '2025-09-05 08:50:46.109337', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:46.315314', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.3821370601654053, 'timestamp': '2025-09-05 08:50:46.318001', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:46.530150', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.3802856504917145, 'timestamp': '2025-09-05 08:50:46.546252', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:50:51.304338', 'step': 200, 'epoch': 1} {'type': 'pplx', 'content': 68.43342716093302, 'timestamp': '2025-09-05 08:50:51.307018', 'step': 200, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 200', 'timestamp': '2025-09-05 08:50:51.794754', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:51.964613', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.32048162817955017, 'timestamp': '2025-09-05 08:50:51.966794', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:52.173366', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.3719877600669861, 'timestamp': '2025-09-05 08:50:52.175903', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:50:52.374299', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.47192662954330444, 'timestamp': '2025-09-05 08:50:52.376239', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:52.573112', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.35540571808815, 'timestamp': '2025-09-05 08:50:52.587796', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:52.777809', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.4534974694252014, 'timestamp': '2025-09-05 08:50:52.779680', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:52.976475', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.2541635036468506, 'timestamp': '2025-09-05 08:50:52.978872', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:53.186056', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.4390164017677307, 'timestamp': '2025-09-05 08:50:53.187977', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:53.394871', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.36420944333076477, 'timestamp': '2025-09-05 08:50:53.409950', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:53.598684', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.3081270754337311, 'timestamp': '2025-09-05 08:50:53.601276', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:53.807163', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.32181161642074585, 'timestamp': '2025-09-05 08:50:53.809123', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:54.016227', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.37587931752204895, 'timestamp': '2025-09-05 08:50:54.018183', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:54.224894', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.3801794648170471, 'timestamp': '2025-09-05 08:50:54.241161', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:54.438898', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.3296459913253784, 'timestamp': '2025-09-05 08:50:54.440903', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:54.636463', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.40059760212898254, 'timestamp': '2025-09-05 08:50:54.638462', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:50:54.845342', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.45545437932014465, 'timestamp': '2025-09-05 08:50:54.847321', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:50:55.045554', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.5213336944580078, 'timestamp': '2025-09-05 08:50:55.059885', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:50:55.248331', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.3463953733444214, 'timestamp': '2025-09-05 08:50:55.250555', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:55.456155', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.36651042103767395, 'timestamp': '2025-09-05 08:50:55.458184', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:55.664880', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.39168816804885864, 'timestamp': '2025-09-05 08:50:55.666768', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:50:55.871436', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.43237030506134033, 'timestamp': '2025-09-05 08:50:55.885583', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:51:00.528160', 'step': 220, 'epoch': 1} {'type': 'pplx', 'content': 66.70753911619636, 'timestamp': '2025-09-05 08:51:00.530320', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:00.694163', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.27549687027931213, 'timestamp': '2025-09-05 08:51:00.696251', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:00.863090', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.4633401036262512, 'timestamp': '2025-09-05 08:51:00.865224', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:01.071101', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.297171413898468, 'timestamp': '2025-09-05 08:51:01.073322', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:01.281221', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.38710904121398926, 'timestamp': '2025-09-05 08:51:01.298898', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:01.496439', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.4238070249557495, 'timestamp': '2025-09-05 08:51:01.498489', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:01.706000', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.4317415952682495, 'timestamp': '2025-09-05 08:51:01.708487', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:01.908165', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.31439393758773804, 'timestamp': '2025-09-05 08:51:01.910255', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:02.116936', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.2890167236328125, 'timestamp': '2025-09-05 08:51:02.131642', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:02.324743', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.4581849277019501, 'timestamp': '2025-09-05 08:51:02.327444', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:02.525196', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.3810874819755554, 'timestamp': '2025-09-05 08:51:02.527445', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:02.733657', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.2984119653701782, 'timestamp': '2025-09-05 08:51:02.736785', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:02.934302', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.37125611305236816, 'timestamp': '2025-09-05 08:51:02.948305', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:03.144451', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.2847510874271393, 'timestamp': '2025-09-05 08:51:03.146597', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:03.352972', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.33447766304016113, 'timestamp': '2025-09-05 08:51:03.355044', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:03.552131', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.32979437708854675, 'timestamp': '2025-09-05 08:51:03.554213', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:03.751051', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.45558422803878784, 'timestamp': '2025-09-05 08:51:03.764967', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:03.953115', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.3728318214416504, 'timestamp': '2025-09-05 08:51:03.955116', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:04.152915', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.31200921535491943, 'timestamp': '2025-09-05 08:51:04.155039', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:04.360922', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.548952579498291, 'timestamp': '2025-09-05 08:51:04.362714', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:04.560346', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.28765401244163513, 'timestamp': '2025-09-05 08:51:04.576314', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:51:09.223025', 'step': 240, 'epoch': 1} {'type': 'pplx', 'content': 65.06928550317252, 'timestamp': '2025-09-05 08:51:09.225006', 'step': 240, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 240', 'timestamp': '2025-09-05 08:51:09.703186', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:09.874355', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.3755890130996704, 'timestamp': '2025-09-05 08:51:09.876365', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:10.072449', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.38877207040786743, 'timestamp': '2025-09-05 08:51:10.074440', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:10.280829', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.49628746509552, 'timestamp': '2025-09-05 08:51:10.282995', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:10.482218', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.5397868752479553, 'timestamp': '2025-09-05 08:51:10.498957', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:10.696694', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.3450985848903656, 'timestamp': '2025-09-05 08:51:10.698570', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:10.893683', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.30554214119911194, 'timestamp': '2025-09-05 08:51:10.895563', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:11.091433', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.3367531895637512, 'timestamp': '2025-09-05 08:51:11.093469', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:11.298971', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.33166182041168213, 'timestamp': '2025-09-05 08:51:11.313272', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:11.503053', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.2959182858467102, 'timestamp': '2025-09-05 08:51:11.505145', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:11.703129', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.3209533095359802, 'timestamp': '2025-09-05 08:51:11.705161', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:11.902631', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.3812739849090576, 'timestamp': '2025-09-05 08:51:11.904804', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:12.103336', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.31804463267326355, 'timestamp': '2025-09-05 08:51:12.119920', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:12.325726', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.38454669713974, 'timestamp': '2025-09-05 08:51:12.327480', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:12.533970', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.357085257768631, 'timestamp': '2025-09-05 08:51:12.535934', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:12.732659', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.5086879730224609, 'timestamp': '2025-09-05 08:51:12.734686', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:51:12.942675', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.31578245759010315, 'timestamp': '2025-09-05 08:51:12.957030', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:13.145896', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.530243456363678, 'timestamp': '2025-09-05 08:51:13.147719', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:13.344584', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.3682427406311035, 'timestamp': '2025-09-05 08:51:13.346470', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:13.543961', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.3959914743900299, 'timestamp': '2025-09-05 08:51:13.545895', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:13.744614', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.46029427647590637, 'timestamp': '2025-09-05 08:51:13.758973', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:51:18.409861', 'step': 260, 'epoch': 1} {'type': 'pplx', 'content': 64.26154569680897, 'timestamp': '2025-09-05 08:51:18.412481', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:18.575862', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.4201185405254364, 'timestamp': '2025-09-05 08:51:18.579922', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:18.749512', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.34828004240989685, 'timestamp': '2025-09-05 08:51:18.751937', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:18.961324', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.3680271506309509, 'timestamp': '2025-09-05 08:51:18.963334', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:19.161929', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.430779367685318, 'timestamp': '2025-09-05 08:51:19.177248', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:19.370004', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.4136819839477539, 'timestamp': '2025-09-05 08:51:19.372994', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:19.582552', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.33954256772994995, 'timestamp': '2025-09-05 08:51:19.585434', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:19.788353', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.3512232005596161, 'timestamp': '2025-09-05 08:51:19.791506', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:20.000570', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.22923022508621216, 'timestamp': '2025-09-05 08:51:20.016069', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:20.207996', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.39777079224586487, 'timestamp': '2025-09-05 08:51:20.210365', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:20.410582', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.34263524413108826, 'timestamp': '2025-09-05 08:51:20.412726', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:20.611599', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.33798688650131226, 'timestamp': '2025-09-05 08:51:20.616122', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:20.815141', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.5267437696456909, 'timestamp': '2025-09-05 08:51:20.834848', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:21.034317', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.41006460785865784, 'timestamp': '2025-09-05 08:51:21.038358', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:21.244494', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.4595869779586792, 'timestamp': '2025-09-05 08:51:21.247391', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:21.450319', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.35671982169151306, 'timestamp': '2025-09-05 08:51:21.452813', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:21.662382', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.3464692234992981, 'timestamp': '2025-09-05 08:51:21.678478', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:21.873740', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.513616144657135, 'timestamp': '2025-09-05 08:51:21.876507', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:22.086577', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.3644759953022003, 'timestamp': '2025-09-05 08:51:22.090146', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:22.295231', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.30014386773109436, 'timestamp': '2025-09-05 08:51:22.302241', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:22.516028', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.3263522684574127, 'timestamp': '2025-09-05 08:51:22.532144', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:51:27.228302', 'step': 280, 'epoch': 1} {'type': 'pplx', 'content': 63.39580635831405, 'timestamp': '2025-09-05 08:51:27.231907', 'step': 280, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 280', 'timestamp': '2025-09-05 08:51:27.737466', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:27.903601', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.2871383726596832, 'timestamp': '2025-09-05 08:51:27.905516', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:28.113830', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.5244265198707581, 'timestamp': '2025-09-05 08:51:28.116159', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:28.287939', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.30767932534217834, 'timestamp': '2025-09-05 08:51:28.290430', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:28.498428', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.42134183645248413, 'timestamp': '2025-09-05 08:51:28.516281', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:28.708204', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.3752477467060089, 'timestamp': '2025-09-05 08:51:28.710520', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:51:28.917749', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.3297032415866852, 'timestamp': '2025-09-05 08:51:28.920190', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:29.118017', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.3052656054496765, 'timestamp': '2025-09-05 08:51:29.121103', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:29.326871', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.3355713188648224, 'timestamp': '2025-09-05 08:51:29.344871', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:29.543183', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.35573214292526245, 'timestamp': '2025-09-05 08:51:29.545838', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:29.753080', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.4543474316596985, 'timestamp': '2025-09-05 08:51:29.756576', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:29.957195', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.3964625298976898, 'timestamp': '2025-09-05 08:51:29.959974', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:30.165709', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.3353409469127655, 'timestamp': '2025-09-05 08:51:30.180114', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:51:30.371786', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.4800986647605896, 'timestamp': '2025-09-05 08:51:30.374286', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:30.576864', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.3704434633255005, 'timestamp': '2025-09-05 08:51:30.579299', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:30.780856', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.4373602271080017, 'timestamp': '2025-09-05 08:51:30.784033', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:30.983714', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.4753832519054413, 'timestamp': '2025-09-05 08:51:30.998392', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:31.191891', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.4039275646209717, 'timestamp': '2025-09-05 08:51:31.195051', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:31.409572', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.35360774397850037, 'timestamp': '2025-09-05 08:51:31.411839', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:31.611994', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.4502357840538025, 'timestamp': '2025-09-05 08:51:31.616480', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:31.824098', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.34486469626426697, 'timestamp': '2025-09-05 08:51:31.839022', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:51:36.541321', 'step': 300, 'epoch': 1} {'type': 'pplx', 'content': 62.91448786259217, 'timestamp': '2025-09-05 08:51:36.543514', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:51:36.706204', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.42969754338264465, 'timestamp': '2025-09-05 08:51:36.708169', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:36.916708', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.3379324972629547, 'timestamp': '2025-09-05 08:51:36.920013', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:37.118930', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.28274306654930115, 'timestamp': '2025-09-05 08:51:37.121827', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:37.330465', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.3989158570766449, 'timestamp': '2025-09-05 08:51:37.344698', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:37.535431', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.45338067412376404, 'timestamp': '2025-09-05 08:51:37.538881', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:51:37.735743', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.4826226234436035, 'timestamp': '2025-09-05 08:51:37.739435', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:37.937478', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.23507355153560638, 'timestamp': '2025-09-05 08:51:37.939847', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:38.148933', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.41786983609199524, 'timestamp': '2025-09-05 08:51:38.165846', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:38.364288', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.3461589813232422, 'timestamp': '2025-09-05 08:51:38.366570', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:38.576965', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.4844832420349121, 'timestamp': '2025-09-05 08:51:38.579372', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:38.787986', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.38971275091171265, 'timestamp': '2025-09-05 08:51:38.790638', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:51:38.997464', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.474933385848999, 'timestamp': '2025-09-05 08:51:39.013582', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:39.206540', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.37481558322906494, 'timestamp': '2025-09-05 08:51:39.210174', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:39.410262', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.3350926637649536, 'timestamp': '2025-09-05 08:51:39.412546', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:39.622336', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.5339053869247437, 'timestamp': '2025-09-05 08:51:39.624825', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:39.836689', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.3207130432128906, 'timestamp': '2025-09-05 08:51:39.851434', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:40.045517', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.35091298818588257, 'timestamp': '2025-09-05 08:51:40.047800', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:40.248682', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.3151177763938904, 'timestamp': '2025-09-05 08:51:40.251910', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:40.448949', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.2805836796760559, 'timestamp': '2025-09-05 08:51:40.453248', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:40.664741', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.31773483753204346, 'timestamp': '2025-09-05 08:51:40.679760', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:51:45.519892', 'step': 320, 'epoch': 1} {'type': 'pplx', 'content': 62.03072212739932, 'timestamp': '2025-09-05 08:51:45.522026', 'step': 320, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 320', 'timestamp': '2025-09-05 08:51:45.987925', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:46.160525', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.287524551153183, 'timestamp': '2025-09-05 08:51:46.164165', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:46.367278', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.356285959482193, 'timestamp': '2025-09-05 08:51:46.369491', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:46.578658', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.5080327987670898, 'timestamp': '2025-09-05 08:51:46.581762', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:46.786024', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.3835434019565582, 'timestamp': '2025-09-05 08:51:46.803007', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:47.003397', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.3160017430782318, 'timestamp': '2025-09-05 08:51:47.006010', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:47.207660', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.4292439818382263, 'timestamp': '2025-09-05 08:51:47.210036', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 08:51:47.416253', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.5584763884544373, 'timestamp': '2025-09-05 08:51:47.418687', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:47.628242', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.2741956412792206, 'timestamp': '2025-09-05 08:51:47.645050', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:47.846435', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.45366376638412476, 'timestamp': '2025-09-05 08:51:47.849132', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:48.049050', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.4707634449005127, 'timestamp': '2025-09-05 08:51:48.051331', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:48.252308', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.32083678245544434, 'timestamp': '2025-09-05 08:51:48.254709', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:48.453175', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.23594726622104645, 'timestamp': '2025-09-05 08:51:48.469797', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:48.670973', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.342578649520874, 'timestamp': '2025-09-05 08:51:48.673988', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:48.884254', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.3945893943309784, 'timestamp': '2025-09-05 08:51:48.886685', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:49.086662', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.2937992811203003, 'timestamp': '2025-09-05 08:51:49.090876', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:49.288803', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.3234483003616333, 'timestamp': '2025-09-05 08:51:49.305690', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:49.506331', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.3142837584018707, 'timestamp': '2025-09-05 08:51:49.508588', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:51:49.706321', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.29564931988716125, 'timestamp': '2025-09-05 08:51:49.712194', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:49.919033', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.2818536162376404, 'timestamp': '2025-09-05 08:51:49.921396', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:50.129125', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.31897684931755066, 'timestamp': '2025-09-05 08:51:50.145532', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:51:54.884325', 'step': 340, 'epoch': 1} {'type': 'pplx', 'content': 61.25419831481063, 'timestamp': '2025-09-05 08:51:54.886751', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:55.051072', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.3685891926288605, 'timestamp': '2025-09-05 08:51:55.054219', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:55.265043', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.4979967176914215, 'timestamp': '2025-09-05 08:51:55.267081', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:55.467739', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.4489175081253052, 'timestamp': '2025-09-05 08:51:55.470109', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:55.671335', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.42277976870536804, 'timestamp': '2025-09-05 08:51:55.686218', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:55.877423', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.26805245876312256, 'timestamp': '2025-09-05 08:51:55.881086', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:51:56.083943', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.4023347496986389, 'timestamp': '2025-09-05 08:51:56.086884', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:56.285593', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.21793437004089355, 'timestamp': '2025-09-05 08:51:56.287738', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:56.497434', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.42367270588874817, 'timestamp': '2025-09-05 08:51:56.514045', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:56.714891', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.19184860587120056, 'timestamp': '2025-09-05 08:51:56.718750', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:56.916934', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.4862029254436493, 'timestamp': '2025-09-05 08:51:56.919071', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:57.119662', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.39638984203338623, 'timestamp': '2025-09-05 08:51:57.123211', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:57.321992', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.44743719696998596, 'timestamp': '2025-09-05 08:51:57.337160', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:57.531589', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.3436514139175415, 'timestamp': '2025-09-05 08:51:57.533859', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:57.733979', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.5128458142280579, 'timestamp': '2025-09-05 08:51:57.736308', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:57.936356', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.38132208585739136, 'timestamp': '2025-09-05 08:51:57.938784', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:58.151174', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.4043236970901489, 'timestamp': '2025-09-05 08:51:58.166887', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:51:58.368726', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.4835319221019745, 'timestamp': '2025-09-05 08:51:58.371395', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:51:58.579671', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.4353181719779968, 'timestamp': '2025-09-05 08:51:58.581758', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:51:58.792551', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.532089352607727, 'timestamp': '2025-09-05 08:51:58.795005', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:51:59.005467', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.36872991919517517, 'timestamp': '2025-09-05 08:51:59.022067', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:52:03.741679', 'step': 360, 'epoch': 1} {'type': 'pplx', 'content': 60.85764402253067, 'timestamp': '2025-09-05 08:52:03.744697', 'step': 360, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 360', 'timestamp': '2025-09-05 08:52:04.267463', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:04.467076', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.4337652027606964, 'timestamp': '2025-09-05 08:52:04.469790', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:04.669343', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.3073671758174896, 'timestamp': '2025-09-05 08:52:04.672067', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:04.870495', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.3424108922481537, 'timestamp': '2025-09-05 08:52:04.872776', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:05.080213', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.4130728542804718, 'timestamp': '2025-09-05 08:52:05.097032', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:05.296805', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.5122348666191101, 'timestamp': '2025-09-05 08:52:05.299866', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:05.512098', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.35365021228790283, 'timestamp': '2025-09-05 08:52:05.514520', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:05.725698', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.3705803155899048, 'timestamp': '2025-09-05 08:52:05.728166', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:05.927508', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.31572356820106506, 'timestamp': '2025-09-05 08:52:05.943003', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:06.136877', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.484190434217453, 'timestamp': '2025-09-05 08:52:06.139123', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:06.338365', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.33856910467147827, 'timestamp': '2025-09-05 08:52:06.340412', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:52:06.548382', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.33974528312683105, 'timestamp': '2025-09-05 08:52:06.551882', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:06.725021', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.28406357765197754, 'timestamp': '2025-09-05 08:52:06.734842', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:06.902028', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.4505447447299957, 'timestamp': '2025-09-05 08:52:06.904266', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:07.072962', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.41192686557769775, 'timestamp': '2025-09-05 08:52:07.075418', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:07.259228', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.48100942373275757, 'timestamp': '2025-09-05 08:52:07.261588', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:07.442104', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.4199207127094269, 'timestamp': '2025-09-05 08:52:07.451785', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:07.618469', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.44891834259033203, 'timestamp': '2025-09-05 08:52:07.620478', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:07.794549', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.38180747628211975, 'timestamp': '2025-09-05 08:52:07.796896', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:07.967300', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.4493151307106018, 'timestamp': '2025-09-05 08:52:07.970670', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:08.151498', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.3687281906604767, 'timestamp': '2025-09-05 08:52:08.161079', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:52:12.875645', 'step': 380, 'epoch': 1} {'type': 'pplx', 'content': 60.50173690886968, 'timestamp': '2025-09-05 08:52:12.879267', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:13.045039', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.24862390756607056, 'timestamp': '2025-09-05 08:52:13.050670', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:13.251042', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.3486191928386688, 'timestamp': '2025-09-05 08:52:13.253890', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:13.465124', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.3658300042152405, 'timestamp': '2025-09-05 08:52:13.467841', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:13.675325', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.31089478731155396, 'timestamp': '2025-09-05 08:52:13.693652', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:13.879125', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.3752908408641815, 'timestamp': '2025-09-05 08:52:13.881370', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:14.090355', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.5100250244140625, 'timestamp': '2025-09-05 08:52:14.092533', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:14.301671', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.5159087181091309, 'timestamp': '2025-09-05 08:52:14.307073', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:14.505032', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.4958210289478302, 'timestamp': '2025-09-05 08:52:14.527608', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:14.729709', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.47933509945869446, 'timestamp': '2025-09-05 08:52:14.732227', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:14.931821', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.4100204110145569, 'timestamp': '2025-09-05 08:52:14.936678', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:15.148605', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.4352912902832031, 'timestamp': '2025-09-05 08:52:15.151091', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:15.349293', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.39970317482948303, 'timestamp': '2025-09-05 08:52:15.365860', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:15.567312', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.3486652076244354, 'timestamp': '2025-09-05 08:52:15.569587', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:15.779892', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.4482623040676117, 'timestamp': '2025-09-05 08:52:15.781888', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:52:15.955997', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.3497057557106018, 'timestamp': '2025-09-05 08:52:15.959493', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:16.167562', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.2155306488275528, 'timestamp': '2025-09-05 08:52:16.181975', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:16.374926', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.2984706163406372, 'timestamp': '2025-09-05 08:52:16.377026', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:16.578129', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.4520842134952545, 'timestamp': '2025-09-05 08:52:16.580222', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:16.787897', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.4184480905532837, 'timestamp': '2025-09-05 08:52:16.791025', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:17.000795', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.391579806804657, 'timestamp': '2025-09-05 08:52:17.016163', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:52:21.710130', 'step': 400, 'epoch': 1} {'type': 'pplx', 'content': 59.69678279140599, 'timestamp': '2025-09-05 08:52:21.712252', 'step': 400, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 400', 'timestamp': '2025-09-05 08:52:22.212048', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:22.382310', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.5822342038154602, 'timestamp': '2025-09-05 08:52:22.384599', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:22.583471', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.26364198327064514, 'timestamp': '2025-09-05 08:52:22.585575', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:22.786247', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.415432333946228, 'timestamp': '2025-09-05 08:52:22.788568', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:22.998089', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.43436604738235474, 'timestamp': '2025-09-05 08:52:23.014811', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:23.218248', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.33526358008384705, 'timestamp': '2025-09-05 08:52:23.220827', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:52:23.419773', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.3843635022640228, 'timestamp': '2025-09-05 08:52:23.423685', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:23.622251', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.3481503427028656, 'timestamp': '2025-09-05 08:52:23.625075', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:23.833551', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.400000661611557, 'timestamp': '2025-09-05 08:52:23.847684', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:24.040538', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.3273821473121643, 'timestamp': '2025-09-05 08:52:24.042628', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:24.251973', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.29011961817741394, 'timestamp': '2025-09-05 08:52:24.254121', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:24.454853', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.4190179407596588, 'timestamp': '2025-09-05 08:52:24.457738', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:24.655875', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.3011251389980316, 'timestamp': '2025-09-05 08:52:24.670472', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:24.865142', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.34651055932044983, 'timestamp': '2025-09-05 08:52:24.869347', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:25.068331', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.4374030530452728, 'timestamp': '2025-09-05 08:52:25.071673', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:25.275075', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.3524605929851532, 'timestamp': '2025-09-05 08:52:25.279079', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:25.476558', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.36618897318840027, 'timestamp': '2025-09-05 08:52:25.491665', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:52:25.683339', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.3304573893547058, 'timestamp': '2025-09-05 08:52:25.685271', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:25.893443', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.2957325577735901, 'timestamp': '2025-09-05 08:52:25.895735', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:26.104484', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.3814864158630371, 'timestamp': '2025-09-05 08:52:26.107123', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:26.312068', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.3151036202907562, 'timestamp': '2025-09-05 08:52:26.328851', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:52:31.036153', 'step': 420, 'epoch': 1} {'type': 'pplx', 'content': 59.35530192163852, 'timestamp': '2025-09-05 08:52:31.038978', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:31.210366', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.3436422348022461, 'timestamp': '2025-09-05 08:52:31.216851', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:31.427152', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.24939358234405518, 'timestamp': '2025-09-05 08:52:31.429321', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:31.603677', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.2410716712474823, 'timestamp': '2025-09-05 08:52:31.605963', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:31.775227', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.4125831723213196, 'timestamp': '2025-09-05 08:52:31.792232', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:31.995860', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.3474792242050171, 'timestamp': '2025-09-05 08:52:31.998205', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:32.198239', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.31995099782943726, 'timestamp': '2025-09-05 08:52:32.200884', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:32.413494', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.3391028046607971, 'timestamp': '2025-09-05 08:52:32.416831', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:52:32.626920', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.38398104906082153, 'timestamp': '2025-09-05 08:52:32.643118', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:32.839796', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.4830264151096344, 'timestamp': '2025-09-05 08:52:32.843610', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:33.096484', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.38072633743286133, 'timestamp': '2025-09-05 08:52:33.098709', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:33.350670', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.35430270433425903, 'timestamp': '2025-09-05 08:52:33.393391', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:33.603830', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.33442458510398865, 'timestamp': '2025-09-05 08:52:33.618554', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:33.810928', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.48529914021492004, 'timestamp': '2025-09-05 08:52:33.814095', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:34.016352', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.414386510848999, 'timestamp': '2025-09-05 08:52:34.018901', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:34.228728', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.5347086191177368, 'timestamp': '2025-09-05 08:52:34.231277', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:34.430951', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.3398420214653015, 'timestamp': '2025-09-05 08:52:34.446031', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:34.681402', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.33175885677337646, 'timestamp': '2025-09-05 08:52:34.683884', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:34.891436', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.30734267830848694, 'timestamp': '2025-09-05 08:52:34.894720', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:35.093238', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.2820775508880615, 'timestamp': '2025-09-05 08:52:35.096035', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:35.389076', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.3953246772289276, 'timestamp': '2025-09-05 08:52:35.405738', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:52:40.619422', 'step': 440, 'epoch': 1} {'type': 'pplx', 'content': 59.06938762912424, 'timestamp': '2025-09-05 08:52:40.622034', 'step': 440, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 440', 'timestamp': '2025-09-05 08:52:41.124812', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:41.337105', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.428076833486557, 'timestamp': '2025-09-05 08:52:41.362204', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:41.616790', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.30740123987197876, 'timestamp': '2025-09-05 08:52:41.619651', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:41.829409', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.3571283221244812, 'timestamp': '2025-09-05 08:52:41.831738', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:42.030375', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.3677189350128174, 'timestamp': '2025-09-05 08:52:42.044872', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:42.237303', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.2622219920158386, 'timestamp': '2025-09-05 08:52:42.240974', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:42.449712', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.3243364989757538, 'timestamp': '2025-09-05 08:52:42.452727', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:52:42.704565', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.2399972379207611, 'timestamp': '2025-09-05 08:52:42.706805', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:42.906947', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.37382861971855164, 'timestamp': '2025-09-05 08:52:42.921140', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:43.115262', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.42507728934288025, 'timestamp': '2025-09-05 08:52:43.118670', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:52:43.370405', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.30817678570747375, 'timestamp': '2025-09-05 08:52:43.373234', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:43.575061', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.3689974248409271, 'timestamp': '2025-09-05 08:52:43.581882', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:43.786456', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.3893755078315735, 'timestamp': '2025-09-05 08:52:43.803301', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:44.068521', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.4274936020374298, 'timestamp': '2025-09-05 08:52:44.071142', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:44.279260', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.24627014994621277, 'timestamp': '2025-09-05 08:52:44.281140', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:44.488500', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.42970582842826843, 'timestamp': '2025-09-05 08:52:44.495746', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:44.698466', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.4055229127407074, 'timestamp': '2025-09-05 08:52:44.716271', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:44.910316', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.3416372537612915, 'timestamp': '2025-09-05 08:52:44.927124', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:52:45.181071', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.2750747799873352, 'timestamp': '2025-09-05 08:52:45.183843', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:45.394187', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.3312303125858307, 'timestamp': '2025-09-05 08:52:45.396553', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:52:45.594916', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.4751344919204712, 'timestamp': '2025-09-05 08:52:45.612023', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:52:51.067022', 'step': 460, 'epoch': 1} {'type': 'pplx', 'content': 59.67811926915719, 'timestamp': '2025-09-05 08:52:51.069230', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:51.234053', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.3601231575012207, 'timestamp': '2025-09-05 08:52:51.236726', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:51.451554', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.3900015354156494, 'timestamp': '2025-09-05 08:52:51.459374', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:51.668075', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.355674684047699, 'timestamp': '2025-09-05 08:52:51.676778', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:51.889242', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.30925506353378296, 'timestamp': '2025-09-05 08:52:51.913685', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:52.111744', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.41809117794036865, 'timestamp': '2025-09-05 08:52:52.114550', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:52.335190', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.31774064898490906, 'timestamp': '2025-09-05 08:52:52.339915', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:52.548704', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.22886255383491516, 'timestamp': '2025-09-05 08:52:52.593754', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:52.803549', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.3445352017879486, 'timestamp': '2025-09-05 08:52:52.822863', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:53.018136', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.41649383306503296, 'timestamp': '2025-09-05 08:52:53.020410', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:53.225503', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.5665808320045471, 'timestamp': '2025-09-05 08:52:53.228809', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:53.431781', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.4123092293739319, 'timestamp': '2025-09-05 08:52:53.433893', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:53.636862', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.4420497417449951, 'timestamp': '2025-09-05 08:52:53.650920', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:53.845865', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.35629701614379883, 'timestamp': '2025-09-05 08:52:53.850369', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:54.163112', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.3124181926250458, 'timestamp': '2025-09-05 08:52:54.206389', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:52:54.461222', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.5296250581741333, 'timestamp': '2025-09-05 08:52:54.463422', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:52:54.680325', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.42064934968948364, 'timestamp': '2025-09-05 08:52:54.706467', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:54.892954', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.37133467197418213, 'timestamp': '2025-09-05 08:52:54.898729', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:52:55.097691', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.311069518327713, 'timestamp': '2025-09-05 08:52:55.100288', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:52:55.397284', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.174685537815094, 'timestamp': '2025-09-05 08:52:55.401764', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:52:55.657892', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.4376969337463379, 'timestamp': '2025-09-05 08:52:55.674695', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:53:01.067886', 'step': 480, 'epoch': 1} {'type': 'pplx', 'content': 60.60817761059333, 'timestamp': '2025-09-05 08:53:01.070163', 'step': 480, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 480', 'timestamp': '2025-09-05 08:53:01.614052', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:01.877643', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.3756929636001587, 'timestamp': '2025-09-05 08:53:01.883127', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:02.085810', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.39133283495903015, 'timestamp': '2025-09-05 08:53:02.088041', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:02.296584', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.2793099284172058, 'timestamp': '2025-09-05 08:53:02.303765', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:02.505169', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.3743014335632324, 'timestamp': '2025-09-05 08:53:02.519161', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:02.754265', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.3891092538833618, 'timestamp': '2025-09-05 08:53:02.798142', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:03.003319', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.377437561750412, 'timestamp': '2025-09-05 08:53:03.005920', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:03.218782', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.2652597725391388, 'timestamp': '2025-09-05 08:53:03.223824', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:03.435831', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.42560693621635437, 'timestamp': '2025-09-05 08:53:03.453133', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:03.677906', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.42667150497436523, 'timestamp': '2025-09-05 08:53:03.681745', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:03.887673', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.21720796823501587, 'timestamp': '2025-09-05 08:53:03.890582', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:04.088914', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.2846270203590393, 'timestamp': '2025-09-05 08:53:04.090855', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:04.289239', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.420718789100647, 'timestamp': '2025-09-05 08:53:04.306191', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:04.504965', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.4062350392341614, 'timestamp': '2025-09-05 08:53:04.520504', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:04.812004', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.36269500851631165, 'timestamp': '2025-09-05 08:53:04.822831', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:05.030379', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.40626439452171326, 'timestamp': '2025-09-05 08:53:05.032308', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:05.239160', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.39664778113365173, 'timestamp': '2025-09-05 08:53:05.257545', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:05.455355', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.4610913395881653, 'timestamp': '2025-09-05 08:53:05.459367', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:53:05.670320', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.3041105568408966, 'timestamp': '2025-09-05 08:53:05.675017', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:05.885044', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.36558741331100464, 'timestamp': '2025-09-05 08:53:05.887386', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:06.094042', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.29611101746559143, 'timestamp': '2025-09-05 08:53:06.108327', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:53:10.952066', 'step': 500, 'epoch': 1} {'type': 'pplx', 'content': 61.06419594850436, 'timestamp': '2025-09-05 08:53:10.955248', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:11.116573', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.3699820637702942, 'timestamp': '2025-09-05 08:53:11.120663', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:11.289895', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.3068196177482605, 'timestamp': '2025-09-05 08:53:11.292064', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:11.497515', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.401764839887619, 'timestamp': '2025-09-05 08:53:11.499413', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:11.694902', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.3740246295928955, 'timestamp': '2025-09-05 08:53:11.710379', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:11.902108', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.3317362368106842, 'timestamp': '2025-09-05 08:53:11.904239', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:12.110463', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.32889097929000854, 'timestamp': '2025-09-05 08:53:12.112806', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:12.314073', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.30558574199676514, 'timestamp': '2025-09-05 08:53:12.316155', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:12.524019', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.4227554500102997, 'timestamp': '2025-09-05 08:53:12.541949', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:12.742901', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.31141963601112366, 'timestamp': '2025-09-05 08:53:12.745837', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:12.951102', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.3356727659702301, 'timestamp': '2025-09-05 08:53:12.953827', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:13.155485', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.27994629740715027, 'timestamp': '2025-09-05 08:53:13.158290', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:13.365905', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.40879178047180176, 'timestamp': '2025-09-05 08:53:13.382155', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:13.596253', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.42021042108535767, 'timestamp': '2025-09-05 08:53:13.598287', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:13.810567', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.47443243861198425, 'timestamp': '2025-09-05 08:53:13.814212', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:14.020899', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.47561585903167725, 'timestamp': '2025-09-05 08:53:14.024098', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:14.276776', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.3374590277671814, 'timestamp': '2025-09-05 08:53:14.291744', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:14.484079', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.3174070715904236, 'timestamp': '2025-09-05 08:53:14.487904', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:14.688330', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.5130405426025391, 'timestamp': '2025-09-05 08:53:14.705020', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:14.916635', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.44266101717948914, 'timestamp': '2025-09-05 08:53:14.921476', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:15.124075', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.307713121175766, 'timestamp': '2025-09-05 08:53:15.140877', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:53:20.516080', 'step': 520, 'epoch': 1} {'type': 'pplx', 'content': 60.16279337842755, 'timestamp': '2025-09-05 08:53:20.518402', 'step': 520, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 520', 'timestamp': '2025-09-05 08:53:21.039725', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:21.209774', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.4752398729324341, 'timestamp': '2025-09-05 08:53:21.211928', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:21.467367', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.3615381419658661, 'timestamp': '2025-09-05 08:53:21.469896', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:21.740291', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.3906700313091278, 'timestamp': '2025-09-05 08:53:21.742462', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:21.955790', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.40926316380500793, 'timestamp': '2025-09-05 08:53:21.972562', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:22.163717', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.3688322603702545, 'timestamp': '2025-09-05 08:53:22.167749', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:22.418586', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.30737367272377014, 'timestamp': '2025-09-05 08:53:22.420423', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:22.627783', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.3638547956943512, 'timestamp': '2025-09-05 08:53:22.632266', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:22.840299', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.3204697072505951, 'timestamp': '2025-09-05 08:53:22.858692', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:23.129245', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.33960676193237305, 'timestamp': '2025-09-05 08:53:23.131263', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:23.336988', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.26278024911880493, 'timestamp': '2025-09-05 08:53:23.343774', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:23.545603', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.46334969997406006, 'timestamp': '2025-09-05 08:53:23.547858', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:23.748378', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.3730030059814453, 'timestamp': '2025-09-05 08:53:23.763616', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:23.955681', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.3750768303871155, 'timestamp': '2025-09-05 08:53:23.958740', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:24.159708', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.4468510150909424, 'timestamp': '2025-09-05 08:53:24.162295', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:24.358945', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.33164605498313904, 'timestamp': '2025-09-05 08:53:24.361476', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:24.560635', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.41563618183135986, 'timestamp': '2025-09-05 08:53:24.583314', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:24.783434', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.2689324915409088, 'timestamp': '2025-09-05 08:53:24.786279', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 08:53:24.996809', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.4886358678340912, 'timestamp': '2025-09-05 08:53:24.998662', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:25.202991', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.31535568833351135, 'timestamp': '2025-09-05 08:53:25.205119', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:25.415088', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.30104270577430725, 'timestamp': '2025-09-05 08:53:25.430097', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:53:30.312544', 'step': 540, 'epoch': 1} {'type': 'pplx', 'content': 59.515503396161826, 'timestamp': '2025-09-05 08:53:30.316745', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:30.481458', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.38238897919654846, 'timestamp': '2025-09-05 08:53:30.483585', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:30.652504', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.3997941017150879, 'timestamp': '2025-09-05 08:53:30.656351', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:30.862613', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.38764917850494385, 'timestamp': '2025-09-05 08:53:30.865289', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:31.115871', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.2606298327445984, 'timestamp': '2025-09-05 08:53:31.132908', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:31.331423', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.4200422465801239, 'timestamp': '2025-09-05 08:53:31.333568', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:31.542945', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.30847230553627014, 'timestamp': '2025-09-05 08:53:31.545735', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:31.755943', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.3630848228931427, 'timestamp': '2025-09-05 08:53:31.758184', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:53:31.961967', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.4338149428367615, 'timestamp': '2025-09-05 08:53:31.982517', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:32.181564', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.43823084235191345, 'timestamp': '2025-09-05 08:53:32.183709', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:32.393487', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.5095869898796082, 'timestamp': '2025-09-05 08:53:32.395847', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:32.607294', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.4471019506454468, 'timestamp': '2025-09-05 08:53:32.609524', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:32.821073', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.2290228307247162, 'timestamp': '2025-09-05 08:53:32.839258', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:33.041860', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.23595690727233887, 'timestamp': '2025-09-05 08:53:33.044168', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:53:33.253035', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.19498829543590546, 'timestamp': '2025-09-05 08:53:33.255795', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:33.465968', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.30002859234809875, 'timestamp': '2025-09-05 08:53:33.467878', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:33.681230', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.3351731300354004, 'timestamp': '2025-09-05 08:53:33.695980', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:33.940273', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.3931390345096588, 'timestamp': '2025-09-05 08:53:33.943284', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:34.150261', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.28822633624076843, 'timestamp': '2025-09-05 08:53:34.153543', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:53:34.351789', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.32444193959236145, 'timestamp': '2025-09-05 08:53:34.354902', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:34.561937', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.3384038209915161, 'timestamp': '2025-09-05 08:53:34.621017', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:53:39.663441', 'step': 560, 'epoch': 1} {'type': 'pplx', 'content': 59.60952474637825, 'timestamp': '2025-09-05 08:53:39.666181', 'step': 560, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 560', 'timestamp': '2025-09-05 08:53:40.136439', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:40.320945', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.2801932096481323, 'timestamp': '2025-09-05 08:53:40.323253', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:40.532084', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.41452479362487793, 'timestamp': '2025-09-05 08:53:40.534062', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:40.735016', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.24028798937797546, 'timestamp': '2025-09-05 08:53:40.737869', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:40.937501', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.3565094769001007, 'timestamp': '2025-09-05 08:53:40.952391', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:41.150448', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.493753045797348, 'timestamp': '2025-09-05 08:53:41.152826', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:41.350962', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.3578570485115051, 'timestamp': '2025-09-05 08:53:41.353796', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:41.562560', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.3159957230091095, 'timestamp': '2025-09-05 08:53:41.584877', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:41.804368', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.3041984736919403, 'timestamp': '2025-09-05 08:53:41.819203', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:42.007943', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.32441309094429016, 'timestamp': '2025-09-05 08:53:42.011035', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:42.219961', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.2862820029258728, 'timestamp': '2025-09-05 08:53:42.222290', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:42.424797', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.3482998013496399, 'timestamp': '2025-09-05 08:53:42.426971', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:42.628721', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.31323230266571045, 'timestamp': '2025-09-05 08:53:42.646181', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:42.835879', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.348871648311615, 'timestamp': '2025-09-05 08:53:42.837879', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:43.034316', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.3106140196323395, 'timestamp': '2025-09-05 08:53:43.036776', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:43.254863', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.47986507415771484, 'timestamp': '2025-09-05 08:53:43.256993', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:43.455155', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.4856138229370117, 'timestamp': '2025-09-05 08:53:43.469870', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:43.659035', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.23853756487369537, 'timestamp': '2025-09-05 08:53:43.661541', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:43.837556', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.34495970606803894, 'timestamp': '2025-09-05 08:53:43.840008', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:44.024806', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.3690643906593323, 'timestamp': '2025-09-05 08:53:44.028077', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:44.212378', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.33360567688941956, 'timestamp': '2025-09-05 08:53:44.229347', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:53:49.114108', 'step': 580, 'epoch': 1} {'type': 'pplx', 'content': 60.10085432045332, 'timestamp': '2025-09-05 08:53:49.117763', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:49.263689', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.3735733926296234, 'timestamp': '2025-09-05 08:53:49.265593', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:49.413395', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.4649202227592468, 'timestamp': '2025-09-05 08:53:49.420816', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:49.651509', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.3592207431793213, 'timestamp': '2025-09-05 08:53:49.653601', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:49.843228', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.45228156447410583, 'timestamp': '2025-09-05 08:53:49.861343', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:50.039570', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.31664058566093445, 'timestamp': '2025-09-05 08:53:50.042225', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:50.218017', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.33015117049217224, 'timestamp': '2025-09-05 08:53:50.220194', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:50.400985', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.367570698261261, 'timestamp': '2025-09-05 08:53:50.404104', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:50.611204', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.2882497310638428, 'timestamp': '2025-09-05 08:53:50.629492', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:50.813865', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.37059807777404785, 'timestamp': '2025-09-05 08:53:50.817712', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:51.023041', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.3063223361968994, 'timestamp': '2025-09-05 08:53:51.025798', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:51.213430', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.45722252130508423, 'timestamp': '2025-09-05 08:53:51.256253', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:51.435498', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.23291143774986267, 'timestamp': '2025-09-05 08:53:51.449993', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:51.620906', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.3415684700012207, 'timestamp': '2025-09-05 08:53:51.624124', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:51.808049', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.3442092537879944, 'timestamp': '2025-09-05 08:53:51.810700', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:51.988789', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.27133557200431824, 'timestamp': '2025-09-05 08:53:51.991133', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:52.243889', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.27585190534591675, 'timestamp': '2025-09-05 08:53:52.258810', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:52.426840', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.41854625940322876, 'timestamp': '2025-09-05 08:53:52.428957', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:52.606477', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.4163596034049988, 'timestamp': '2025-09-05 08:53:52.609183', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:53:52.794736', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.24686819314956665, 'timestamp': '2025-09-05 08:53:52.797153', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:53:52.993744', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.27659153938293457, 'timestamp': '2025-09-05 08:53:53.008373', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:53:58.070605', 'step': 600, 'epoch': 1} {'type': 'pplx', 'content': 60.150732301423496, 'timestamp': '2025-09-05 08:53:58.072774', 'step': 600, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 600', 'timestamp': '2025-09-05 08:53:58.524032', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:53:58.695316', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.36288002133369446, 'timestamp': '2025-09-05 08:53:58.697367', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:58.904083', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.2768324017524719, 'timestamp': '2025-09-05 08:53:58.906425', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:53:59.104371', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.3677636682987213, 'timestamp': '2025-09-05 08:53:59.106677', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:59.315197', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.5175231099128723, 'timestamp': '2025-09-05 08:53:59.329583', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:53:59.521505', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.30161038041114807, 'timestamp': '2025-09-05 08:53:59.524024', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:53:59.720228', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.19638413190841675, 'timestamp': '2025-09-05 08:53:59.723539', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:53:59.931213', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.403491348028183, 'timestamp': '2025-09-05 08:53:59.933056', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:00.142014', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.5172940492630005, 'timestamp': '2025-09-05 08:54:00.158712', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:00.360444', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.43559950590133667, 'timestamp': '2025-09-05 08:54:00.363569', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:00.575220', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.39010724425315857, 'timestamp': '2025-09-05 08:54:00.578011', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:00.785308', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.38972899317741394, 'timestamp': '2025-09-05 08:54:00.788490', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:54:00.998969', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.26329943537712097, 'timestamp': '2025-09-05 08:54:01.015379', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:01.213718', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.47966381907463074, 'timestamp': '2025-09-05 08:54:01.216175', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:01.426283', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.3759992718696594, 'timestamp': '2025-09-05 08:54:01.429304', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:01.648956', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.43966683745384216, 'timestamp': '2025-09-05 08:54:01.650896', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:01.857825', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.2430787980556488, 'timestamp': '2025-09-05 08:54:01.872042', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:02.064318', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.3986109793186188, 'timestamp': '2025-09-05 08:54:02.066439', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:02.314801', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.34388166666030884, 'timestamp': '2025-09-05 08:54:02.347074', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:02.556618', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.23649443686008453, 'timestamp': '2025-09-05 08:54:02.559154', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:02.759787', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.3429713249206543, 'timestamp': '2025-09-05 08:54:02.776171', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:54:07.720311', 'step': 620, 'epoch': 1} {'type': 'pplx', 'content': 60.0059737944772, 'timestamp': '2025-09-05 08:54:07.722616', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:07.886060', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.38172709941864014, 'timestamp': '2025-09-05 08:54:07.888163', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:08.085941', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.29247331619262695, 'timestamp': '2025-09-05 08:54:08.089072', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:08.289376', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.30984073877334595, 'timestamp': '2025-09-05 08:54:08.291291', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:54:08.489187', 'step': 623, 'epoch': 1} {'type': 'loss', 'content': 0.4325138032436371, 'timestamp': '2025-09-05 08:54:08.504002', 'step': 624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:08.693990', 'step': 624, 'epoch': 1} {'type': 'loss', 'content': 0.38128820061683655, 'timestamp': '2025-09-05 08:54:08.697641', 'step': 625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:54:08.897279', 'step': 625, 'epoch': 1} {'type': 'loss', 'content': 0.34949132800102234, 'timestamp': '2025-09-05 08:54:08.899825', 'step': 626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:09.107520', 'step': 626, 'epoch': 1} {'type': 'loss', 'content': 0.21804000437259674, 'timestamp': '2025-09-05 08:54:09.109945', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:09.318482', 'step': 627, 'epoch': 1} {'type': 'loss', 'content': 0.31279170513153076, 'timestamp': '2025-09-05 08:54:09.334969', 'step': 628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:09.524559', 'step': 628, 'epoch': 1} {'type': 'loss', 'content': 0.4154609441757202, 'timestamp': '2025-09-05 08:54:09.527910', 'step': 629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:09.724828', 'step': 629, 'epoch': 1} {'type': 'loss', 'content': 0.36364659667015076, 'timestamp': '2025-09-05 08:54:09.727232', 'step': 630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:09.923511', 'step': 630, 'epoch': 1} {'type': 'loss', 'content': 0.566509485244751, 'timestamp': '2025-09-05 08:54:09.925616', 'step': 631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:10.122125', 'step': 631, 'epoch': 1} {'type': 'loss', 'content': 0.2817482054233551, 'timestamp': '2025-09-05 08:54:10.136938', 'step': 632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:10.328401', 'step': 632, 'epoch': 1} {'type': 'loss', 'content': 0.4373268485069275, 'timestamp': '2025-09-05 08:54:10.330653', 'step': 633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:10.537850', 'step': 633, 'epoch': 1} {'type': 'loss', 'content': 0.2138715237379074, 'timestamp': '2025-09-05 08:54:10.540845', 'step': 634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:10.738631', 'step': 634, 'epoch': 1} {'type': 'loss', 'content': 0.3299078643321991, 'timestamp': '2025-09-05 08:54:10.740930', 'step': 635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:10.937830', 'step': 635, 'epoch': 1} {'type': 'loss', 'content': 0.38988572359085083, 'timestamp': '2025-09-05 08:54:10.952014', 'step': 636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:11.142159', 'step': 636, 'epoch': 1} {'type': 'loss', 'content': 0.5013632774353027, 'timestamp': '2025-09-05 08:54:11.146725', 'step': 637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:11.353235', 'step': 637, 'epoch': 1} {'type': 'loss', 'content': 0.25191569328308105, 'timestamp': '2025-09-05 08:54:11.355726', 'step': 638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:11.552894', 'step': 638, 'epoch': 1} {'type': 'loss', 'content': 0.3327215313911438, 'timestamp': '2025-09-05 08:54:11.556011', 'step': 639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:11.752920', 'step': 639, 'epoch': 1} {'type': 'loss', 'content': 0.3358171880245209, 'timestamp': '2025-09-05 08:54:11.767103', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:54:16.930895', 'step': 640, 'epoch': 1} {'type': 'pplx', 'content': 60.013876298957854, 'timestamp': '2025-09-05 08:54:16.932912', 'step': 640, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 640', 'timestamp': '2025-09-05 08:54:17.406898', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:17.592032', 'step': 640, 'epoch': 1} {'type': 'loss', 'content': 0.49637261033058167, 'timestamp': '2025-09-05 08:54:17.593921', 'step': 641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:17.799004', 'step': 641, 'epoch': 1} {'type': 'loss', 'content': 0.33820703625679016, 'timestamp': '2025-09-05 08:54:17.801216', 'step': 642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:18.013549', 'step': 642, 'epoch': 1} {'type': 'loss', 'content': 0.3308141529560089, 'timestamp': '2025-09-05 08:54:18.016186', 'step': 643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:18.258292', 'step': 643, 'epoch': 1} {'type': 'loss', 'content': 0.40199655294418335, 'timestamp': '2025-09-05 08:54:18.273111', 'step': 644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:18.462659', 'step': 644, 'epoch': 1} {'type': 'loss', 'content': 0.30657947063446045, 'timestamp': '2025-09-05 08:54:18.464765', 'step': 645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:18.665532', 'step': 645, 'epoch': 1} {'type': 'loss', 'content': 0.3859306573867798, 'timestamp': '2025-09-05 08:54:18.667913', 'step': 646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:18.876768', 'step': 646, 'epoch': 1} {'type': 'loss', 'content': 0.3470034599304199, 'timestamp': '2025-09-05 08:54:18.879140', 'step': 647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:19.110177', 'step': 647, 'epoch': 1} {'type': 'loss', 'content': 0.3591715395450592, 'timestamp': '2025-09-05 08:54:19.125306', 'step': 648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:19.314904', 'step': 648, 'epoch': 1} {'type': 'loss', 'content': 0.29138100147247314, 'timestamp': '2025-09-05 08:54:19.317592', 'step': 649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:19.526396', 'step': 649, 'epoch': 1} {'type': 'loss', 'content': 0.48001521825790405, 'timestamp': '2025-09-05 08:54:19.529551', 'step': 650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:19.734777', 'step': 650, 'epoch': 1} {'type': 'loss', 'content': 0.30436667799949646, 'timestamp': '2025-09-05 08:54:19.738102', 'step': 651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:19.944375', 'step': 651, 'epoch': 1} {'type': 'loss', 'content': 0.41145777702331543, 'timestamp': '2025-09-05 08:54:19.960171', 'step': 652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:54:20.150221', 'step': 652, 'epoch': 1} {'type': 'loss', 'content': 0.4586491584777832, 'timestamp': '2025-09-05 08:54:20.152746', 'step': 653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:20.359771', 'step': 653, 'epoch': 1} {'type': 'loss', 'content': 0.2969052791595459, 'timestamp': '2025-09-05 08:54:20.362605', 'step': 654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:20.572199', 'step': 654, 'epoch': 1} {'type': 'loss', 'content': 0.4161323010921478, 'timestamp': '2025-09-05 08:54:20.575376', 'step': 655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:20.782477', 'step': 655, 'epoch': 1} {'type': 'loss', 'content': 0.36784666776657104, 'timestamp': '2025-09-05 08:54:20.799260', 'step': 656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:21.003719', 'step': 656, 'epoch': 1} {'type': 'loss', 'content': 0.30674105882644653, 'timestamp': '2025-09-05 08:54:21.005922', 'step': 657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:21.204382', 'step': 657, 'epoch': 1} {'type': 'loss', 'content': 0.3142521381378174, 'timestamp': '2025-09-05 08:54:21.207457', 'step': 658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:21.421749', 'step': 658, 'epoch': 1} {'type': 'loss', 'content': 0.4763060212135315, 'timestamp': '2025-09-05 08:54:21.425096', 'step': 659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:21.631974', 'step': 659, 'epoch': 1} {'type': 'loss', 'content': 0.2550380825996399, 'timestamp': '2025-09-05 08:54:21.646742', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:54:26.783208', 'step': 660, 'epoch': 1} {'type': 'pplx', 'content': 59.977893482287755, 'timestamp': '2025-09-05 08:54:26.785662', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:26.946480', 'step': 660, 'epoch': 1} {'type': 'loss', 'content': 0.3140474855899811, 'timestamp': '2025-09-05 08:54:26.949023', 'step': 661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:27.117596', 'step': 661, 'epoch': 1} {'type': 'loss', 'content': 0.5123893022537231, 'timestamp': '2025-09-05 08:54:27.120661', 'step': 662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:27.326935', 'step': 662, 'epoch': 1} {'type': 'loss', 'content': 0.3251909017562866, 'timestamp': '2025-09-05 08:54:27.328976', 'step': 663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:27.525600', 'step': 663, 'epoch': 1} {'type': 'loss', 'content': 0.34844574332237244, 'timestamp': '2025-09-05 08:54:27.540739', 'step': 664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:54:27.740184', 'step': 664, 'epoch': 1} {'type': 'loss', 'content': 0.4417649209499359, 'timestamp': '2025-09-05 08:54:27.823505', 'step': 665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:28.118452', 'step': 665, 'epoch': 1} {'type': 'loss', 'content': 0.2881294786930084, 'timestamp': '2025-09-05 08:54:28.120838', 'step': 666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:28.319720', 'step': 666, 'epoch': 1} {'type': 'loss', 'content': 0.3033196032047272, 'timestamp': '2025-09-05 08:54:28.322799', 'step': 667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:28.532384', 'step': 667, 'epoch': 1} {'type': 'loss', 'content': 0.35287392139434814, 'timestamp': '2025-09-05 08:54:28.547149', 'step': 668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:28.739925', 'step': 668, 'epoch': 1} {'type': 'loss', 'content': 0.3460994362831116, 'timestamp': '2025-09-05 08:54:28.816991', 'step': 669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:29.071266', 'step': 669, 'epoch': 1} {'type': 'loss', 'content': 0.2680738866329193, 'timestamp': '2025-09-05 08:54:29.073805', 'step': 670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:29.279231', 'step': 670, 'epoch': 1} {'type': 'loss', 'content': 0.3204239010810852, 'timestamp': '2025-09-05 08:54:29.286163', 'step': 671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:29.489604', 'step': 671, 'epoch': 1} {'type': 'loss', 'content': 0.2820996344089508, 'timestamp': '2025-09-05 08:54:29.508998', 'step': 672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:29.798942', 'step': 672, 'epoch': 1} {'type': 'loss', 'content': 0.3304928243160248, 'timestamp': '2025-09-05 08:54:29.801447', 'step': 673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:30.010222', 'step': 673, 'epoch': 1} {'type': 'loss', 'content': 0.3490171730518341, 'timestamp': '2025-09-05 08:54:30.013730', 'step': 674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:30.224578', 'step': 674, 'epoch': 1} {'type': 'loss', 'content': 0.39344292879104614, 'timestamp': '2025-09-05 08:54:30.227117', 'step': 675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:30.423153', 'step': 675, 'epoch': 1} {'type': 'loss', 'content': 0.2898615300655365, 'timestamp': '2025-09-05 08:54:30.440851', 'step': 676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:30.692418', 'step': 676, 'epoch': 1} {'type': 'loss', 'content': 0.4104023575782776, 'timestamp': '2025-09-05 08:54:30.696366', 'step': 677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:30.902770', 'step': 677, 'epoch': 1} {'type': 'loss', 'content': 0.30414190888404846, 'timestamp': '2025-09-05 08:54:30.905070', 'step': 678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:31.103059', 'step': 678, 'epoch': 1} {'type': 'loss', 'content': 0.4684240520000458, 'timestamp': '2025-09-05 08:54:31.106036', 'step': 679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:54:31.312497', 'step': 679, 'epoch': 1} {'type': 'loss', 'content': 0.34773412346839905, 'timestamp': '2025-09-05 08:54:31.328594', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:54:36.267599', 'step': 680, 'epoch': 1} {'type': 'pplx', 'content': 60.13139868121867, 'timestamp': '2025-09-05 08:54:36.271850', 'step': 680, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 680', 'timestamp': '2025-09-05 08:54:36.750503', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:36.998015', 'step': 680, 'epoch': 1} {'type': 'loss', 'content': 0.3354809880256653, 'timestamp': '2025-09-05 08:54:37.000382', 'step': 681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:37.200209', 'step': 681, 'epoch': 1} {'type': 'loss', 'content': 0.45760607719421387, 'timestamp': '2025-09-05 08:54:37.203851', 'step': 682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:37.402177', 'step': 682, 'epoch': 1} {'type': 'loss', 'content': 0.19389431178569794, 'timestamp': '2025-09-05 08:54:37.404069', 'step': 683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:37.603549', 'step': 683, 'epoch': 1} {'type': 'loss', 'content': 0.45813360810279846, 'timestamp': '2025-09-05 08:54:37.621567', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 08:54:37.823186', 'step': 684, 'epoch': 1} {'type': 'loss', 'content': 0.4114960730075836, 'timestamp': '2025-09-05 08:54:37.825184', 'step': 685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:38.029268', 'step': 685, 'epoch': 1} {'type': 'loss', 'content': 0.320042222738266, 'timestamp': '2025-09-05 08:54:38.032156', 'step': 686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:54:38.228650', 'step': 686, 'epoch': 1} {'type': 'loss', 'content': 0.3017887473106384, 'timestamp': '2025-09-05 08:54:38.230931', 'step': 687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:38.438526', 'step': 687, 'epoch': 1} {'type': 'loss', 'content': 0.19925743341445923, 'timestamp': '2025-09-05 08:54:38.455228', 'step': 688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:38.655031', 'step': 688, 'epoch': 1} {'type': 'loss', 'content': 0.3546088635921478, 'timestamp': '2025-09-05 08:54:38.656855', 'step': 689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:38.855417', 'step': 689, 'epoch': 1} {'type': 'loss', 'content': 0.2611202001571655, 'timestamp': '2025-09-05 08:54:38.857665', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:39.064991', 'step': 690, 'epoch': 1} {'type': 'loss', 'content': 0.4317798912525177, 'timestamp': '2025-09-05 08:54:39.067201', 'step': 691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:39.262353', 'step': 691, 'epoch': 1} {'type': 'loss', 'content': 0.3688901662826538, 'timestamp': '2025-09-05 08:54:39.279461', 'step': 692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:39.475335', 'step': 692, 'epoch': 1} {'type': 'loss', 'content': 0.288263201713562, 'timestamp': '2025-09-05 08:54:39.477507', 'step': 693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 08:54:39.685377', 'step': 693, 'epoch': 1} {'type': 'loss', 'content': 0.38892319798469543, 'timestamp': '2025-09-05 08:54:39.687607', 'step': 694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:39.894436', 'step': 694, 'epoch': 1} {'type': 'loss', 'content': 0.334379643201828, 'timestamp': '2025-09-05 08:54:39.897342', 'step': 695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:40.100425', 'step': 695, 'epoch': 1} {'type': 'loss', 'content': 0.37965139746665955, 'timestamp': '2025-09-05 08:54:40.116980', 'step': 696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:40.314915', 'step': 696, 'epoch': 1} {'type': 'loss', 'content': 0.380183607339859, 'timestamp': '2025-09-05 08:54:40.317039', 'step': 697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:40.516922', 'step': 697, 'epoch': 1} {'type': 'loss', 'content': 0.37185657024383545, 'timestamp': '2025-09-05 08:54:40.519343', 'step': 698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:40.717729', 'step': 698, 'epoch': 1} {'type': 'loss', 'content': 0.3406052589416504, 'timestamp': '2025-09-05 08:54:40.720247', 'step': 699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:40.918093', 'step': 699, 'epoch': 1} {'type': 'loss', 'content': 0.29893720149993896, 'timestamp': '2025-09-05 08:54:40.933084', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:54:45.631525', 'step': 700, 'epoch': 1} {'type': 'pplx', 'content': 60.328020476201374, 'timestamp': '2025-09-05 08:54:45.634419', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:45.796220', 'step': 700, 'epoch': 1} {'type': 'loss', 'content': 0.2962213158607483, 'timestamp': '2025-09-05 08:54:45.798188', 'step': 701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:46.001282', 'step': 701, 'epoch': 1} {'type': 'loss', 'content': 0.36525845527648926, 'timestamp': '2025-09-05 08:54:46.003188', 'step': 702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:46.201526', 'step': 702, 'epoch': 1} {'type': 'loss', 'content': 0.46742987632751465, 'timestamp': '2025-09-05 08:54:46.203916', 'step': 703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:46.399294', 'step': 703, 'epoch': 1} {'type': 'loss', 'content': 0.26656708121299744, 'timestamp': '2025-09-05 08:54:46.416569', 'step': 704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:46.614553', 'step': 704, 'epoch': 1} {'type': 'loss', 'content': 0.3398812711238861, 'timestamp': '2025-09-05 08:54:46.616644', 'step': 705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:46.811613', 'step': 705, 'epoch': 1} {'type': 'loss', 'content': 0.31698310375213623, 'timestamp': '2025-09-05 08:54:46.813745', 'step': 706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:47.010716', 'step': 706, 'epoch': 1} {'type': 'loss', 'content': 0.3885927200317383, 'timestamp': '2025-09-05 08:54:47.012461', 'step': 707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:47.216767', 'step': 707, 'epoch': 1} {'type': 'loss', 'content': 0.26296836137771606, 'timestamp': '2025-09-05 08:54:47.230773', 'step': 708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:47.427633', 'step': 708, 'epoch': 1} {'type': 'loss', 'content': 0.41065317392349243, 'timestamp': '2025-09-05 08:54:47.429933', 'step': 709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:47.626215', 'step': 709, 'epoch': 1} {'type': 'loss', 'content': 0.3585359752178192, 'timestamp': '2025-09-05 08:54:47.628728', 'step': 710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:54:47.824583', 'step': 710, 'epoch': 1} {'type': 'loss', 'content': 0.40104833245277405, 'timestamp': '2025-09-05 08:54:47.828309', 'step': 711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:48.022731', 'step': 711, 'epoch': 1} {'type': 'loss', 'content': 0.4005032777786255, 'timestamp': '2025-09-05 08:54:48.036802', 'step': 712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:48.224089', 'step': 712, 'epoch': 1} {'type': 'loss', 'content': 0.3810969591140747, 'timestamp': '2025-09-05 08:54:48.226238', 'step': 713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:48.421427', 'step': 713, 'epoch': 1} {'type': 'loss', 'content': 0.4647989571094513, 'timestamp': '2025-09-05 08:54:48.423241', 'step': 714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:48.627565', 'step': 714, 'epoch': 1} {'type': 'loss', 'content': 0.26305657625198364, 'timestamp': '2025-09-05 08:54:48.629519', 'step': 715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:48.826866', 'step': 715, 'epoch': 1} {'type': 'loss', 'content': 0.3350170850753784, 'timestamp': '2025-09-05 08:54:48.841368', 'step': 716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:49.030348', 'step': 716, 'epoch': 1} {'type': 'loss', 'content': 0.2926214337348938, 'timestamp': '2025-09-05 08:54:49.032122', 'step': 717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:49.239250', 'step': 717, 'epoch': 1} {'type': 'loss', 'content': 0.4400876760482788, 'timestamp': '2025-09-05 08:54:49.241213', 'step': 718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:49.448651', 'step': 718, 'epoch': 1} {'type': 'loss', 'content': 0.2703559994697571, 'timestamp': '2025-09-05 08:54:49.450304', 'step': 719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:49.657250', 'step': 719, 'epoch': 1} {'type': 'loss', 'content': 0.1777232140302658, 'timestamp': '2025-09-05 08:54:49.673678', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:54:54.347893', 'step': 720, 'epoch': 1} {'type': 'pplx', 'content': 60.46760355156991, 'timestamp': '2025-09-05 08:54:54.349759', 'step': 720, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 720', 'timestamp': '2025-09-05 08:54:54.823525', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:55.017249', 'step': 720, 'epoch': 1} {'type': 'loss', 'content': 0.3396753668785095, 'timestamp': '2025-09-05 08:54:55.019130', 'step': 721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:55.227345', 'step': 721, 'epoch': 1} {'type': 'loss', 'content': 0.45180007815361023, 'timestamp': '2025-09-05 08:54:55.228947', 'step': 722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:55.424212', 'step': 722, 'epoch': 1} {'type': 'loss', 'content': 0.43839141726493835, 'timestamp': '2025-09-05 08:54:55.426285', 'step': 723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:55.623172', 'step': 723, 'epoch': 1} {'type': 'loss', 'content': 0.38653379678726196, 'timestamp': '2025-09-05 08:54:55.640180', 'step': 724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:55.831590', 'step': 724, 'epoch': 1} {'type': 'loss', 'content': 0.302320271730423, 'timestamp': '2025-09-05 08:54:55.833560', 'step': 725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:56.038980', 'step': 725, 'epoch': 1} {'type': 'loss', 'content': 0.3913811147212982, 'timestamp': '2025-09-05 08:54:56.040877', 'step': 726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:56.246521', 'step': 726, 'epoch': 1} {'type': 'loss', 'content': 0.2894538640975952, 'timestamp': '2025-09-05 08:54:56.248271', 'step': 727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:56.452672', 'step': 727, 'epoch': 1} {'type': 'loss', 'content': 0.29082658886909485, 'timestamp': '2025-09-05 08:54:56.469163', 'step': 728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:56.664440', 'step': 728, 'epoch': 1} {'type': 'loss', 'content': 0.4690100848674774, 'timestamp': '2025-09-05 08:54:56.667290', 'step': 729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:56.863708', 'step': 729, 'epoch': 1} {'type': 'loss', 'content': 0.4146786332130432, 'timestamp': '2025-09-05 08:54:56.866143', 'step': 730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:54:57.061283', 'step': 730, 'epoch': 1} {'type': 'loss', 'content': 0.4398282766342163, 'timestamp': '2025-09-05 08:54:57.063210', 'step': 731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:57.259524', 'step': 731, 'epoch': 1} {'type': 'loss', 'content': 0.3548581600189209, 'timestamp': '2025-09-05 08:54:57.273889', 'step': 732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:57.459570', 'step': 732, 'epoch': 1} {'type': 'loss', 'content': 0.2405700534582138, 'timestamp': '2025-09-05 08:54:57.461338', 'step': 733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:57.657548', 'step': 733, 'epoch': 1} {'type': 'loss', 'content': 0.40556153655052185, 'timestamp': '2025-09-05 08:54:57.659320', 'step': 734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:54:57.823137', 'step': 734, 'epoch': 1} {'type': 'loss', 'content': 0.3558599650859833, 'timestamp': '2025-09-05 08:54:57.825123', 'step': 735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:58.028019', 'step': 735, 'epoch': 1} {'type': 'loss', 'content': 0.3774275779724121, 'timestamp': '2025-09-05 08:54:58.042770', 'step': 736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:54:58.242072', 'step': 736, 'epoch': 1} {'type': 'loss', 'content': 0.4390884339809418, 'timestamp': '2025-09-05 08:54:58.245108', 'step': 737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:54:58.441939', 'step': 737, 'epoch': 1} {'type': 'loss', 'content': 0.27866774797439575, 'timestamp': '2025-09-05 08:54:58.451705', 'step': 738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:54:58.648957', 'step': 738, 'epoch': 1} {'type': 'loss', 'content': 0.2765721082687378, 'timestamp': '2025-09-05 08:54:58.653039', 'step': 739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:54:58.848295', 'step': 739, 'epoch': 1} {'type': 'loss', 'content': 0.29264095425605774, 'timestamp': '2025-09-05 08:54:58.862799', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:55:03.526058', 'step': 740, 'epoch': 1} {'type': 'pplx', 'content': 60.057011510032595, 'timestamp': '2025-09-05 08:55:03.528500', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:03.689395', 'step': 740, 'epoch': 1} {'type': 'loss', 'content': 0.24096165597438812, 'timestamp': '2025-09-05 08:55:03.691725', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:03.861152', 'step': 741, 'epoch': 1} {'type': 'loss', 'content': 0.276774525642395, 'timestamp': '2025-09-05 08:55:03.863850', 'step': 742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:04.067519', 'step': 742, 'epoch': 1} {'type': 'loss', 'content': 0.4308259189128876, 'timestamp': '2025-09-05 08:55:04.069692', 'step': 743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:04.275651', 'step': 743, 'epoch': 1} {'type': 'loss', 'content': 0.2998303472995758, 'timestamp': '2025-09-05 08:55:04.291124', 'step': 744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:55:04.479757', 'step': 744, 'epoch': 1} {'type': 'loss', 'content': 0.3581094443798065, 'timestamp': '2025-09-05 08:55:04.481424', 'step': 745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:04.686029', 'step': 745, 'epoch': 1} {'type': 'loss', 'content': 0.44069504737854004, 'timestamp': '2025-09-05 08:55:04.687941', 'step': 746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:04.892816', 'step': 746, 'epoch': 1} {'type': 'loss', 'content': 0.2966330647468567, 'timestamp': '2025-09-05 08:55:04.894551', 'step': 747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:05.091527', 'step': 747, 'epoch': 1} {'type': 'loss', 'content': 0.4456374943256378, 'timestamp': '2025-09-05 08:55:05.105860', 'step': 748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:05.300254', 'step': 748, 'epoch': 1} {'type': 'loss', 'content': 0.38735565543174744, 'timestamp': '2025-09-05 08:55:05.301897', 'step': 749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:05.496733', 'step': 749, 'epoch': 1} {'type': 'loss', 'content': 0.3685053884983063, 'timestamp': '2025-09-05 08:55:05.498685', 'step': 750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:05.705472', 'step': 750, 'epoch': 1} {'type': 'loss', 'content': 0.3653118908405304, 'timestamp': '2025-09-05 08:55:05.707251', 'step': 751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:05.911657', 'step': 751, 'epoch': 1} {'type': 'loss', 'content': 0.46427515149116516, 'timestamp': '2025-09-05 08:55:05.925953', 'step': 752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:06.115561', 'step': 752, 'epoch': 1} {'type': 'loss', 'content': 0.33055830001831055, 'timestamp': '2025-09-05 08:55:06.117285', 'step': 753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:06.321142', 'step': 753, 'epoch': 1} {'type': 'loss', 'content': 0.3238837718963623, 'timestamp': '2025-09-05 08:55:06.322865', 'step': 754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:06.518158', 'step': 754, 'epoch': 1} {'type': 'loss', 'content': 0.33274197578430176, 'timestamp': '2025-09-05 08:55:06.519817', 'step': 755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:06.716279', 'step': 755, 'epoch': 1} {'type': 'loss', 'content': 0.6049551367759705, 'timestamp': '2025-09-05 08:55:06.732935', 'step': 756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:06.929062', 'step': 756, 'epoch': 1} {'type': 'loss', 'content': 0.4643118977546692, 'timestamp': '2025-09-05 08:55:06.930811', 'step': 757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:07.096596', 'step': 757, 'epoch': 1} {'type': 'loss', 'content': 0.37005674839019775, 'timestamp': '2025-09-05 08:55:07.098944', 'step': 758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:07.306042', 'step': 758, 'epoch': 1} {'type': 'loss', 'content': 0.270403116941452, 'timestamp': '2025-09-05 08:55:07.307925', 'step': 759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:07.512213', 'step': 759, 'epoch': 1} {'type': 'loss', 'content': 0.3156965374946594, 'timestamp': '2025-09-05 08:55:07.521310', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:55:12.156535', 'step': 760, 'epoch': 1} {'type': 'pplx', 'content': 59.66704724636567, 'timestamp': '2025-09-05 08:55:12.158506', 'step': 760, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 760', 'timestamp': '2025-09-05 08:55:12.626072', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:12.793769', 'step': 760, 'epoch': 1} {'type': 'loss', 'content': 0.24948005378246307, 'timestamp': '2025-09-05 08:55:12.795644', 'step': 761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:12.995458', 'step': 761, 'epoch': 1} {'type': 'loss', 'content': 0.33802419900894165, 'timestamp': '2025-09-05 08:55:12.997015', 'step': 762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:13.200557', 'step': 762, 'epoch': 1} {'type': 'loss', 'content': 0.2108272910118103, 'timestamp': '2025-09-05 08:55:13.202057', 'step': 763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:13.397743', 'step': 763, 'epoch': 1} {'type': 'loss', 'content': 0.31162819266319275, 'timestamp': '2025-09-05 08:55:13.411933', 'step': 764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:13.598710', 'step': 764, 'epoch': 1} {'type': 'loss', 'content': 0.44521042704582214, 'timestamp': '2025-09-05 08:55:13.600271', 'step': 765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:13.795524', 'step': 765, 'epoch': 1} {'type': 'loss', 'content': 0.3431508243083954, 'timestamp': '2025-09-05 08:55:13.797365', 'step': 766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:14.002948', 'step': 766, 'epoch': 1} {'type': 'loss', 'content': 0.31309953331947327, 'timestamp': '2025-09-05 08:55:14.004890', 'step': 767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:14.199262', 'step': 767, 'epoch': 1} {'type': 'loss', 'content': 0.2508161664009094, 'timestamp': '2025-09-05 08:55:14.216268', 'step': 768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:14.416044', 'step': 768, 'epoch': 1} {'type': 'loss', 'content': 0.35422399640083313, 'timestamp': '2025-09-05 08:55:14.417865', 'step': 769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:14.625474', 'step': 769, 'epoch': 1} {'type': 'loss', 'content': 0.39631184935569763, 'timestamp': '2025-09-05 08:55:14.627596', 'step': 770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:55:14.834274', 'step': 770, 'epoch': 1} {'type': 'loss', 'content': 0.35908469557762146, 'timestamp': '2025-09-05 08:55:14.836925', 'step': 771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:15.033942', 'step': 771, 'epoch': 1} {'type': 'loss', 'content': 0.44913047552108765, 'timestamp': '2025-09-05 08:55:15.050281', 'step': 772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:15.248779', 'step': 772, 'epoch': 1} {'type': 'loss', 'content': 0.35234954953193665, 'timestamp': '2025-09-05 08:55:15.250645', 'step': 773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:15.455657', 'step': 773, 'epoch': 1} {'type': 'loss', 'content': 0.26836010813713074, 'timestamp': '2025-09-05 08:55:15.457801', 'step': 774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:15.655351', 'step': 774, 'epoch': 1} {'type': 'loss', 'content': 0.4194760024547577, 'timestamp': '2025-09-05 08:55:15.657289', 'step': 775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:15.857106', 'step': 775, 'epoch': 1} {'type': 'loss', 'content': 0.4029027223587036, 'timestamp': '2025-09-05 08:55:15.871310', 'step': 776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:16.067439', 'step': 776, 'epoch': 1} {'type': 'loss', 'content': 0.3827188014984131, 'timestamp': '2025-09-05 08:55:16.069132', 'step': 777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:16.274296', 'step': 777, 'epoch': 1} {'type': 'loss', 'content': 0.3042537569999695, 'timestamp': '2025-09-05 08:55:16.276538', 'step': 778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:16.485146', 'step': 778, 'epoch': 1} {'type': 'loss', 'content': 0.3845559358596802, 'timestamp': '2025-09-05 08:55:16.486916', 'step': 779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:16.682919', 'step': 779, 'epoch': 1} {'type': 'loss', 'content': 0.3793331980705261, 'timestamp': '2025-09-05 08:55:16.696900', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:55:21.332508', 'step': 780, 'epoch': 1} {'type': 'pplx', 'content': 59.53960384330279, 'timestamp': '2025-09-05 08:55:21.334280', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:21.495075', 'step': 780, 'epoch': 1} {'type': 'loss', 'content': 0.42627277970314026, 'timestamp': '2025-09-05 08:55:21.497499', 'step': 781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:21.665091', 'step': 781, 'epoch': 1} {'type': 'loss', 'content': 0.2579745650291443, 'timestamp': '2025-09-05 08:55:21.666790', 'step': 782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:21.872172', 'step': 782, 'epoch': 1} {'type': 'loss', 'content': 0.2808438539505005, 'timestamp': '2025-09-05 08:55:21.874532', 'step': 783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:22.070858', 'step': 783, 'epoch': 1} {'type': 'loss', 'content': 0.2570682764053345, 'timestamp': '2025-09-05 08:55:22.080329', 'step': 784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:22.242021', 'step': 784, 'epoch': 1} {'type': 'loss', 'content': 0.39906033873558044, 'timestamp': '2025-09-05 08:55:22.243659', 'step': 785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:22.450233', 'step': 785, 'epoch': 1} {'type': 'loss', 'content': 0.288041889667511, 'timestamp': '2025-09-05 08:55:22.452889', 'step': 786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:22.621107', 'step': 786, 'epoch': 1} {'type': 'loss', 'content': 0.3034781217575073, 'timestamp': '2025-09-05 08:55:22.623097', 'step': 787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:22.817939', 'step': 787, 'epoch': 1} {'type': 'loss', 'content': 0.3924509584903717, 'timestamp': '2025-09-05 08:55:22.827178', 'step': 788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:22.991967', 'step': 788, 'epoch': 1} {'type': 'loss', 'content': 0.3101724684238434, 'timestamp': '2025-09-05 08:55:22.993571', 'step': 789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:23.198900', 'step': 789, 'epoch': 1} {'type': 'loss', 'content': 0.35364800691604614, 'timestamp': '2025-09-05 08:55:23.200466', 'step': 790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:23.395760', 'step': 790, 'epoch': 1} {'type': 'loss', 'content': 0.29069453477859497, 'timestamp': '2025-09-05 08:55:23.398959', 'step': 791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:23.595445', 'step': 791, 'epoch': 1} {'type': 'loss', 'content': 0.2604013979434967, 'timestamp': '2025-09-05 08:55:23.611776', 'step': 792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:23.809541', 'step': 792, 'epoch': 1} {'type': 'loss', 'content': 0.3716432452201843, 'timestamp': '2025-09-05 08:55:23.811288', 'step': 793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:24.006648', 'step': 793, 'epoch': 1} {'type': 'loss', 'content': 0.2955947518348694, 'timestamp': '2025-09-05 08:55:24.008408', 'step': 794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:24.204286', 'step': 794, 'epoch': 1} {'type': 'loss', 'content': 0.4398670792579651, 'timestamp': '2025-09-05 08:55:24.206033', 'step': 795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:24.400419', 'step': 795, 'epoch': 1} {'type': 'loss', 'content': 0.39011335372924805, 'timestamp': '2025-09-05 08:55:24.414895', 'step': 796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:55:24.610922', 'step': 796, 'epoch': 1} {'type': 'loss', 'content': 0.4034815728664398, 'timestamp': '2025-09-05 08:55:24.614525', 'step': 797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:24.815788', 'step': 797, 'epoch': 1} {'type': 'loss', 'content': 0.33042198419570923, 'timestamp': '2025-09-05 08:55:24.817431', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:25.022565', 'step': 798, 'epoch': 1} {'type': 'loss', 'content': 0.35239022970199585, 'timestamp': '2025-09-05 08:55:25.024113', 'step': 799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:25.229432', 'step': 799, 'epoch': 1} {'type': 'loss', 'content': 0.38791370391845703, 'timestamp': '2025-09-05 08:55:25.243544', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:55:29.889311', 'step': 800, 'epoch': 1} {'type': 'pplx', 'content': 58.95891625814123, 'timestamp': '2025-09-05 08:55:29.891231', 'step': 800, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 800', 'timestamp': '2025-09-05 08:55:30.365184', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:55:30.534979', 'step': 800, 'epoch': 1} {'type': 'loss', 'content': 0.36771687865257263, 'timestamp': '2025-09-05 08:55:30.537058', 'step': 801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:30.733797', 'step': 801, 'epoch': 1} {'type': 'loss', 'content': 0.265923410654068, 'timestamp': '2025-09-05 08:55:30.735543', 'step': 802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:30.940549', 'step': 802, 'epoch': 1} {'type': 'loss', 'content': 0.3522961437702179, 'timestamp': '2025-09-05 08:55:30.942341', 'step': 803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:31.108326', 'step': 803, 'epoch': 1} {'type': 'loss', 'content': 0.3711682856082916, 'timestamp': '2025-09-05 08:55:31.124440', 'step': 804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:31.321294', 'step': 804, 'epoch': 1} {'type': 'loss', 'content': 0.282815158367157, 'timestamp': '2025-09-05 08:55:31.325387', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:31.524159', 'step': 805, 'epoch': 1} {'type': 'loss', 'content': 0.3416968882083893, 'timestamp': '2025-09-05 08:55:31.527357', 'step': 806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:31.725722', 'step': 806, 'epoch': 1} {'type': 'loss', 'content': 0.35129979252815247, 'timestamp': '2025-09-05 08:55:31.729004', 'step': 807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:31.926674', 'step': 807, 'epoch': 1} {'type': 'loss', 'content': 0.4110635221004486, 'timestamp': '2025-09-05 08:55:31.943568', 'step': 808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:32.141794', 'step': 808, 'epoch': 1} {'type': 'loss', 'content': 0.5824915766716003, 'timestamp': '2025-09-05 08:55:32.143517', 'step': 809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:32.348023', 'step': 809, 'epoch': 1} {'type': 'loss', 'content': 0.4817580282688141, 'timestamp': '2025-09-05 08:55:32.350161', 'step': 810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:32.555273', 'step': 810, 'epoch': 1} {'type': 'loss', 'content': 0.3244217038154602, 'timestamp': '2025-09-05 08:55:32.557016', 'step': 811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:32.754146', 'step': 811, 'epoch': 1} {'type': 'loss', 'content': 0.377000093460083, 'timestamp': '2025-09-05 08:55:32.763538', 'step': 812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:32.928523', 'step': 812, 'epoch': 1} {'type': 'loss', 'content': 0.40550920367240906, 'timestamp': '2025-09-05 08:55:32.930496', 'step': 813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:33.135757', 'step': 813, 'epoch': 1} {'type': 'loss', 'content': 0.3100672662258148, 'timestamp': '2025-09-05 08:55:33.138151', 'step': 814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:33.346041', 'step': 814, 'epoch': 1} {'type': 'loss', 'content': 0.37069234251976013, 'timestamp': '2025-09-05 08:55:33.348524', 'step': 815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:33.553773', 'step': 815, 'epoch': 1} {'type': 'loss', 'content': 0.3425869047641754, 'timestamp': '2025-09-05 08:55:33.568801', 'step': 816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:33.756241', 'step': 816, 'epoch': 1} {'type': 'loss', 'content': 0.4293539524078369, 'timestamp': '2025-09-05 08:55:33.758599', 'step': 817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:33.962790', 'step': 817, 'epoch': 1} {'type': 'loss', 'content': 0.31967583298683167, 'timestamp': '2025-09-05 08:55:33.964832', 'step': 818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:34.160870', 'step': 818, 'epoch': 1} {'type': 'loss', 'content': 0.38507017493247986, 'timestamp': '2025-09-05 08:55:34.163142', 'step': 819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:34.333933', 'step': 819, 'epoch': 1} {'type': 'loss', 'content': 0.42399299144744873, 'timestamp': '2025-09-05 08:55:34.349042', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:55:39.001578', 'step': 820, 'epoch': 1} {'type': 'pplx', 'content': 58.07706559280723, 'timestamp': '2025-09-05 08:55:39.003502', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:39.163673', 'step': 820, 'epoch': 1} {'type': 'loss', 'content': 0.3387710452079773, 'timestamp': '2025-09-05 08:55:39.165694', 'step': 821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:39.333573', 'step': 821, 'epoch': 1} {'type': 'loss', 'content': 0.2951369285583496, 'timestamp': '2025-09-05 08:55:39.335550', 'step': 822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:39.543460', 'step': 822, 'epoch': 1} {'type': 'loss', 'content': 0.35738304257392883, 'timestamp': '2025-09-05 08:55:39.546054', 'step': 823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:39.742891', 'step': 823, 'epoch': 1} {'type': 'loss', 'content': 0.3632713258266449, 'timestamp': '2025-09-05 08:55:39.756880', 'step': 824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:39.953686', 'step': 824, 'epoch': 1} {'type': 'loss', 'content': 0.37211427092552185, 'timestamp': '2025-09-05 08:55:39.955910', 'step': 825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:40.162235', 'step': 825, 'epoch': 1} {'type': 'loss', 'content': 0.3463480472564697, 'timestamp': '2025-09-05 08:55:40.164065', 'step': 826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:40.371292', 'step': 826, 'epoch': 1} {'type': 'loss', 'content': 0.30579307675361633, 'timestamp': '2025-09-05 08:55:40.373150', 'step': 827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:40.580090', 'step': 827, 'epoch': 1} {'type': 'loss', 'content': 0.40940576791763306, 'timestamp': '2025-09-05 08:55:40.594902', 'step': 828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:40.792047', 'step': 828, 'epoch': 1} {'type': 'loss', 'content': 0.36023303866386414, 'timestamp': '2025-09-05 08:55:40.793964', 'step': 829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:40.990352', 'step': 829, 'epoch': 1} {'type': 'loss', 'content': 0.3491280972957611, 'timestamp': '2025-09-05 08:55:40.992165', 'step': 830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:41.188266', 'step': 830, 'epoch': 1} {'type': 'loss', 'content': 0.24948126077651978, 'timestamp': '2025-09-05 08:55:41.189890', 'step': 831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:41.387216', 'step': 831, 'epoch': 1} {'type': 'loss', 'content': 0.31294217705726624, 'timestamp': '2025-09-05 08:55:41.401260', 'step': 832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:41.590851', 'step': 832, 'epoch': 1} {'type': 'loss', 'content': 0.4010050594806671, 'timestamp': '2025-09-05 08:55:41.592639', 'step': 833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:41.788041', 'step': 833, 'epoch': 1} {'type': 'loss', 'content': 0.3963695466518402, 'timestamp': '2025-09-05 08:55:41.790319', 'step': 834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:41.986059', 'step': 834, 'epoch': 1} {'type': 'loss', 'content': 0.4662969708442688, 'timestamp': '2025-09-05 08:55:41.988012', 'step': 835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:42.185669', 'step': 835, 'epoch': 1} {'type': 'loss', 'content': 0.44282209873199463, 'timestamp': '2025-09-05 08:55:42.199987', 'step': 836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:42.388364', 'step': 836, 'epoch': 1} {'type': 'loss', 'content': 0.2699596881866455, 'timestamp': '2025-09-05 08:55:42.390059', 'step': 837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:42.555606', 'step': 837, 'epoch': 1} {'type': 'loss', 'content': 0.42487242817878723, 'timestamp': '2025-09-05 08:55:42.557823', 'step': 838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:42.763983', 'step': 838, 'epoch': 1} {'type': 'loss', 'content': 0.25971612334251404, 'timestamp': '2025-09-05 08:55:42.766011', 'step': 839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:55:42.964541', 'step': 839, 'epoch': 1} {'type': 'loss', 'content': 0.3642326593399048, 'timestamp': '2025-09-05 08:55:42.978888', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:55:47.651856', 'step': 840, 'epoch': 1} {'type': 'pplx', 'content': 58.15736728059577, 'timestamp': '2025-09-05 08:55:47.653828', 'step': 840, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 840', 'timestamp': '2025-09-05 08:55:48.144064', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:48.312663', 'step': 840, 'epoch': 1} {'type': 'loss', 'content': 0.517951488494873, 'timestamp': '2025-09-05 08:55:48.314430', 'step': 841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:48.508434', 'step': 841, 'epoch': 1} {'type': 'loss', 'content': 0.297713965177536, 'timestamp': '2025-09-05 08:55:48.510702', 'step': 842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:48.707704', 'step': 842, 'epoch': 1} {'type': 'loss', 'content': 0.4420361816883087, 'timestamp': '2025-09-05 08:55:48.709846', 'step': 843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:48.876016', 'step': 843, 'epoch': 1} {'type': 'loss', 'content': 0.4004204273223877, 'timestamp': '2025-09-05 08:55:48.892549', 'step': 844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:49.090938', 'step': 844, 'epoch': 1} {'type': 'loss', 'content': 0.3982437551021576, 'timestamp': '2025-09-05 08:55:49.092914', 'step': 845, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:49.261019', 'step': 845, 'epoch': 1} {'type': 'loss', 'content': 0.36752861738204956, 'timestamp': '2025-09-05 08:55:49.265198', 'step': 846, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:49.460827', 'step': 846, 'epoch': 1} {'type': 'loss', 'content': 0.2838650941848755, 'timestamp': '2025-09-05 08:55:49.463902', 'step': 847, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:49.673060', 'step': 847, 'epoch': 1} {'type': 'loss', 'content': 0.3069932758808136, 'timestamp': '2025-09-05 08:55:49.687326', 'step': 848, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:49.877164', 'step': 848, 'epoch': 1} {'type': 'loss', 'content': 0.43903687596321106, 'timestamp': '2025-09-05 08:55:49.879086', 'step': 849, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:50.075926', 'step': 849, 'epoch': 1} {'type': 'loss', 'content': 0.2802756428718567, 'timestamp': '2025-09-05 08:55:50.078314', 'step': 850, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:50.285280', 'step': 850, 'epoch': 1} {'type': 'loss', 'content': 0.24855327606201172, 'timestamp': '2025-09-05 08:55:50.287275', 'step': 851, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:50.489334', 'step': 851, 'epoch': 1} {'type': 'loss', 'content': 0.4161975681781769, 'timestamp': '2025-09-05 08:55:50.503796', 'step': 852, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:50.695106', 'step': 852, 'epoch': 1} {'type': 'loss', 'content': 0.45736148953437805, 'timestamp': '2025-09-05 08:55:50.698191', 'step': 853, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:50.903392', 'step': 853, 'epoch': 1} {'type': 'loss', 'content': 0.36564257740974426, 'timestamp': '2025-09-05 08:55:50.905223', 'step': 854, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:51.102891', 'step': 854, 'epoch': 1} {'type': 'loss', 'content': 0.37967172265052795, 'timestamp': '2025-09-05 08:55:51.105298', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:51.304529', 'step': 855, 'epoch': 1} {'type': 'loss', 'content': 0.4026603400707245, 'timestamp': '2025-09-05 08:55:51.321055', 'step': 856, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:51.519946', 'step': 856, 'epoch': 1} {'type': 'loss', 'content': 0.22071857750415802, 'timestamp': '2025-09-05 08:55:51.522074', 'step': 857, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:51.720784', 'step': 857, 'epoch': 1} {'type': 'loss', 'content': 0.47518184781074524, 'timestamp': '2025-09-05 08:55:51.722581', 'step': 858, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:51.928566', 'step': 858, 'epoch': 1} {'type': 'loss', 'content': 0.3092558681964874, 'timestamp': '2025-09-05 08:55:51.930334', 'step': 859, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:52.127465', 'step': 859, 'epoch': 1} {'type': 'loss', 'content': 0.2976468801498413, 'timestamp': '2025-09-05 08:55:52.142114', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:55:56.850637', 'step': 860, 'epoch': 1} {'type': 'pplx', 'content': 58.433954195954186, 'timestamp': '2025-09-05 08:55:56.852647', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:57.015085', 'step': 860, 'epoch': 1} {'type': 'loss', 'content': 0.3771287798881531, 'timestamp': '2025-09-05 08:55:57.016999', 'step': 861, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:57.221522', 'step': 861, 'epoch': 1} {'type': 'loss', 'content': 0.4275374710559845, 'timestamp': '2025-09-05 08:55:57.223447', 'step': 862, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:57.432116', 'step': 862, 'epoch': 1} {'type': 'loss', 'content': 0.4731064438819885, 'timestamp': '2025-09-05 08:55:57.434290', 'step': 863, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:57.632130', 'step': 863, 'epoch': 1} {'type': 'loss', 'content': 0.2372477501630783, 'timestamp': '2025-09-05 08:55:57.646792', 'step': 864, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:57.846296', 'step': 864, 'epoch': 1} {'type': 'loss', 'content': 0.328989714384079, 'timestamp': '2025-09-05 08:55:57.848392', 'step': 865, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:55:58.048431', 'step': 865, 'epoch': 1} {'type': 'loss', 'content': 0.43380752205848694, 'timestamp': '2025-09-05 08:55:58.050531', 'step': 866, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:58.248313', 'step': 866, 'epoch': 1} {'type': 'loss', 'content': 0.31248700618743896, 'timestamp': '2025-09-05 08:55:58.250155', 'step': 867, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:55:58.449096', 'step': 867, 'epoch': 1} {'type': 'loss', 'content': 0.3782794773578644, 'timestamp': '2025-09-05 08:55:58.463518', 'step': 868, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:55:58.655068', 'step': 868, 'epoch': 1} {'type': 'loss', 'content': 0.46563515067100525, 'timestamp': '2025-09-05 08:55:58.657655', 'step': 869, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:58.853817', 'step': 869, 'epoch': 1} {'type': 'loss', 'content': 0.30388137698173523, 'timestamp': '2025-09-05 08:55:58.855603', 'step': 870, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:59.065956', 'step': 870, 'epoch': 1} {'type': 'loss', 'content': 0.27287933230400085, 'timestamp': '2025-09-05 08:55:59.067801', 'step': 871, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:55:59.267381', 'step': 871, 'epoch': 1} {'type': 'loss', 'content': 0.23755168914794922, 'timestamp': '2025-09-05 08:55:59.284414', 'step': 872, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:59.482355', 'step': 872, 'epoch': 1} {'type': 'loss', 'content': 0.37823018431663513, 'timestamp': '2025-09-05 08:55:59.484362', 'step': 873, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:55:59.690986', 'step': 873, 'epoch': 1} {'type': 'loss', 'content': 0.4183447062969208, 'timestamp': '2025-09-05 08:55:59.694146', 'step': 874, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:55:59.902530', 'step': 874, 'epoch': 1} {'type': 'loss', 'content': 0.36634987592697144, 'timestamp': '2025-09-05 08:55:59.904278', 'step': 875, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:00.112019', 'step': 875, 'epoch': 1} {'type': 'loss', 'content': 0.5448222756385803, 'timestamp': '2025-09-05 08:56:00.126417', 'step': 876, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:00.323866', 'step': 876, 'epoch': 1} {'type': 'loss', 'content': 0.38981127738952637, 'timestamp': '2025-09-05 08:56:00.325658', 'step': 877, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:00.533301', 'step': 877, 'epoch': 1} {'type': 'loss', 'content': 0.3060409724712372, 'timestamp': '2025-09-05 08:56:00.535138', 'step': 878, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:00.732164', 'step': 878, 'epoch': 1} {'type': 'loss', 'content': 0.23060426115989685, 'timestamp': '2025-09-05 08:56:00.733955', 'step': 879, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:00.930960', 'step': 879, 'epoch': 1} {'type': 'loss', 'content': 0.3777828514575958, 'timestamp': '2025-09-05 08:56:00.945178', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:56:05.651096', 'step': 880, 'epoch': 1} {'type': 'pplx', 'content': 58.9930927717275, 'timestamp': '2025-09-05 08:56:05.653602', 'step': 880, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 880', 'timestamp': '2025-09-05 08:56:06.119224', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:06.284103', 'step': 880, 'epoch': 1} {'type': 'loss', 'content': 0.35168516635894775, 'timestamp': '2025-09-05 08:56:06.286070', 'step': 881, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:06.485122', 'step': 881, 'epoch': 1} {'type': 'loss', 'content': 0.4242359697818756, 'timestamp': '2025-09-05 08:56:06.487493', 'step': 882, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:06.658498', 'step': 882, 'epoch': 1} {'type': 'loss', 'content': 0.3520510792732239, 'timestamp': '2025-09-05 08:56:06.660398', 'step': 883, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:06.866185', 'step': 883, 'epoch': 1} {'type': 'loss', 'content': 0.2529168725013733, 'timestamp': '2025-09-05 08:56:06.880043', 'step': 884, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:07.068068', 'step': 884, 'epoch': 1} {'type': 'loss', 'content': 0.4118684232234955, 'timestamp': '2025-09-05 08:56:07.071747', 'step': 885, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:07.283773', 'step': 885, 'epoch': 1} {'type': 'loss', 'content': 0.2805653512477875, 'timestamp': '2025-09-05 08:56:07.290307', 'step': 886, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:07.498225', 'step': 886, 'epoch': 1} {'type': 'loss', 'content': 0.3658747971057892, 'timestamp': '2025-09-05 08:56:07.503757', 'step': 887, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:07.723427', 'step': 887, 'epoch': 1} {'type': 'loss', 'content': 0.5264832973480225, 'timestamp': '2025-09-05 08:56:07.738249', 'step': 888, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:07.931407', 'step': 888, 'epoch': 1} {'type': 'loss', 'content': 0.4625481069087982, 'timestamp': '2025-09-05 08:56:07.934631', 'step': 889, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:08.137874', 'step': 889, 'epoch': 1} {'type': 'loss', 'content': 0.2717623710632324, 'timestamp': '2025-09-05 08:56:08.141167', 'step': 890, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:08.355120', 'step': 890, 'epoch': 1} {'type': 'loss', 'content': 0.26788651943206787, 'timestamp': '2025-09-05 08:56:08.357651', 'step': 891, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:08.556363', 'step': 891, 'epoch': 1} {'type': 'loss', 'content': 0.43440476059913635, 'timestamp': '2025-09-05 08:56:08.572805', 'step': 892, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:08.763294', 'step': 892, 'epoch': 1} {'type': 'loss', 'content': 0.46988850831985474, 'timestamp': '2025-09-05 08:56:08.765546', 'step': 893, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:08.975242', 'step': 893, 'epoch': 1} {'type': 'loss', 'content': 0.35338684916496277, 'timestamp': '2025-09-05 08:56:08.977587', 'step': 894, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:09.196975', 'step': 894, 'epoch': 1} {'type': 'loss', 'content': 0.3955008089542389, 'timestamp': '2025-09-05 08:56:09.205880', 'step': 895, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:09.416322', 'step': 895, 'epoch': 1} {'type': 'loss', 'content': 0.4844711422920227, 'timestamp': '2025-09-05 08:56:09.435475', 'step': 896, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:09.636116', 'step': 896, 'epoch': 1} {'type': 'loss', 'content': 0.38304269313812256, 'timestamp': '2025-09-05 08:56:09.640059', 'step': 897, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:09.844074', 'step': 897, 'epoch': 1} {'type': 'loss', 'content': 0.4427229166030884, 'timestamp': '2025-09-05 08:56:09.846695', 'step': 898, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:10.052111', 'step': 898, 'epoch': 1} {'type': 'loss', 'content': 0.31587520241737366, 'timestamp': '2025-09-05 08:56:10.054379', 'step': 899, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:10.262257', 'step': 899, 'epoch': 1} {'type': 'loss', 'content': 0.2594102919101715, 'timestamp': '2025-09-05 08:56:10.276857', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:56:14.955511', 'step': 900, 'epoch': 1} {'type': 'pplx', 'content': 58.8565162716663, 'timestamp': '2025-09-05 08:56:14.957342', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:15.119484', 'step': 900, 'epoch': 1} {'type': 'loss', 'content': 0.3601221740245819, 'timestamp': '2025-09-05 08:56:15.121057', 'step': 901, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:15.326247', 'step': 901, 'epoch': 1} {'type': 'loss', 'content': 0.296526163816452, 'timestamp': '2025-09-05 08:56:15.327954', 'step': 902, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:15.534820', 'step': 902, 'epoch': 1} {'type': 'loss', 'content': 0.2748814821243286, 'timestamp': '2025-09-05 08:56:15.537456', 'step': 903, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:15.735119', 'step': 903, 'epoch': 1} {'type': 'loss', 'content': 0.31274929642677307, 'timestamp': '2025-09-05 08:56:15.752496', 'step': 904, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:15.949953', 'step': 904, 'epoch': 1} {'type': 'loss', 'content': 0.45903486013412476, 'timestamp': '2025-09-05 08:56:15.952321', 'step': 905, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:16.160410', 'step': 905, 'epoch': 1} {'type': 'loss', 'content': 0.32286933064460754, 'timestamp': '2025-09-05 08:56:16.162171', 'step': 906, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:16.359899', 'step': 906, 'epoch': 1} {'type': 'loss', 'content': 0.3217444121837616, 'timestamp': '2025-09-05 08:56:16.361873', 'step': 907, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:16.556943', 'step': 907, 'epoch': 1} {'type': 'loss', 'content': 0.2954324185848236, 'timestamp': '2025-09-05 08:56:16.571357', 'step': 908, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:16.759132', 'step': 908, 'epoch': 1} {'type': 'loss', 'content': 0.3679000735282898, 'timestamp': '2025-09-05 08:56:16.761079', 'step': 909, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:16.959729', 'step': 909, 'epoch': 1} {'type': 'loss', 'content': 0.41062143445014954, 'timestamp': '2025-09-05 08:56:16.961531', 'step': 910, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:17.157829', 'step': 910, 'epoch': 1} {'type': 'loss', 'content': 0.36205968260765076, 'timestamp': '2025-09-05 08:56:17.160372', 'step': 911, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:17.356759', 'step': 911, 'epoch': 1} {'type': 'loss', 'content': 0.3409394323825836, 'timestamp': '2025-09-05 08:56:17.365973', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:17.529633', 'step': 912, 'epoch': 1} {'type': 'loss', 'content': 0.31686097383499146, 'timestamp': '2025-09-05 08:56:17.531168', 'step': 913, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:17.697289', 'step': 913, 'epoch': 1} {'type': 'loss', 'content': 0.22255216538906097, 'timestamp': '2025-09-05 08:56:17.699236', 'step': 914, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:17.905578', 'step': 914, 'epoch': 1} {'type': 'loss', 'content': 0.30591610074043274, 'timestamp': '2025-09-05 08:56:17.907210', 'step': 915, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:18.103761', 'step': 915, 'epoch': 1} {'type': 'loss', 'content': 0.36227649450302124, 'timestamp': '2025-09-05 08:56:18.118540', 'step': 916, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:18.307208', 'step': 916, 'epoch': 1} {'type': 'loss', 'content': 0.29489269852638245, 'timestamp': '2025-09-05 08:56:18.308973', 'step': 917, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:18.506067', 'step': 917, 'epoch': 1} {'type': 'loss', 'content': 0.3366081118583679, 'timestamp': '2025-09-05 08:56:18.507597', 'step': 918, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:18.716065', 'step': 918, 'epoch': 1} {'type': 'loss', 'content': 0.2996135652065277, 'timestamp': '2025-09-05 08:56:18.717925', 'step': 919, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:18.916645', 'step': 919, 'epoch': 1} {'type': 'loss', 'content': 0.31638243794441223, 'timestamp': '2025-09-05 08:56:18.930921', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:56:23.565560', 'step': 920, 'epoch': 1} {'type': 'pplx', 'content': 58.30528894827069, 'timestamp': '2025-09-05 08:56:23.567482', 'step': 920, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 920', 'timestamp': '2025-09-05 08:56:24.094332', 'step': 920, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:24.266169', 'step': 920, 'epoch': 1} {'type': 'loss', 'content': 0.31981584429740906, 'timestamp': '2025-09-05 08:56:24.268944', 'step': 921, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:24.440635', 'step': 921, 'epoch': 1} {'type': 'loss', 'content': 0.4614734649658203, 'timestamp': '2025-09-05 08:56:24.442107', 'step': 922, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:24.647745', 'step': 922, 'epoch': 1} {'type': 'loss', 'content': 0.3498396575450897, 'timestamp': '2025-09-05 08:56:24.649674', 'step': 923, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:24.855227', 'step': 923, 'epoch': 1} {'type': 'loss', 'content': 0.2465495467185974, 'timestamp': '2025-09-05 08:56:24.864527', 'step': 924, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:25.029444', 'step': 924, 'epoch': 1} {'type': 'loss', 'content': 0.5145090818405151, 'timestamp': '2025-09-05 08:56:25.031415', 'step': 925, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:25.238088', 'step': 925, 'epoch': 1} {'type': 'loss', 'content': 0.3693583607673645, 'timestamp': '2025-09-05 08:56:25.239819', 'step': 926, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:25.437346', 'step': 926, 'epoch': 1} {'type': 'loss', 'content': 0.33343541622161865, 'timestamp': '2025-09-05 08:56:25.439379', 'step': 927, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:25.644585', 'step': 927, 'epoch': 1} {'type': 'loss', 'content': 0.28294891119003296, 'timestamp': '2025-09-05 08:56:25.659020', 'step': 928, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:25.847035', 'step': 928, 'epoch': 1} {'type': 'loss', 'content': 0.3684309720993042, 'timestamp': '2025-09-05 08:56:25.848699', 'step': 929, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:26.044378', 'step': 929, 'epoch': 1} {'type': 'loss', 'content': 0.3786807060241699, 'timestamp': '2025-09-05 08:56:26.046519', 'step': 930, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:26.252989', 'step': 930, 'epoch': 1} {'type': 'loss', 'content': 0.2511294186115265, 'timestamp': '2025-09-05 08:56:26.254868', 'step': 931, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:26.451301', 'step': 931, 'epoch': 1} {'type': 'loss', 'content': 0.26673150062561035, 'timestamp': '2025-09-05 08:56:26.469187', 'step': 932, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:26.664664', 'step': 932, 'epoch': 1} {'type': 'loss', 'content': 0.2586221396923065, 'timestamp': '2025-09-05 08:56:26.667023', 'step': 933, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:26.861586', 'step': 933, 'epoch': 1} {'type': 'loss', 'content': 0.23219725489616394, 'timestamp': '2025-09-05 08:56:26.864119', 'step': 934, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:27.060475', 'step': 934, 'epoch': 1} {'type': 'loss', 'content': 0.3612004816532135, 'timestamp': '2025-09-05 08:56:27.062849', 'step': 935, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:27.258572', 'step': 935, 'epoch': 1} {'type': 'loss', 'content': 0.19465096294879913, 'timestamp': '2025-09-05 08:56:27.274029', 'step': 936, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:27.461704', 'step': 936, 'epoch': 1} {'type': 'loss', 'content': 0.2936038374900818, 'timestamp': '2025-09-05 08:56:27.464576', 'step': 937, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:27.661307', 'step': 937, 'epoch': 1} {'type': 'loss', 'content': 0.34567755460739136, 'timestamp': '2025-09-05 08:56:27.663756', 'step': 938, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:27.869389', 'step': 938, 'epoch': 1} {'type': 'loss', 'content': 0.45169833302497864, 'timestamp': '2025-09-05 08:56:27.873145', 'step': 939, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:28.068724', 'step': 939, 'epoch': 1} {'type': 'loss', 'content': 0.41843459010124207, 'timestamp': '2025-09-05 08:56:28.083793', 'step': 940, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:56:32.727257', 'step': 940, 'epoch': 1} {'type': 'pplx', 'content': 57.96766332265921, 'timestamp': '2025-09-05 08:56:32.729095', 'step': 940, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:32.889362', 'step': 940, 'epoch': 1} {'type': 'loss', 'content': 0.33605149388313293, 'timestamp': '2025-09-05 08:56:32.891081', 'step': 941, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:33.057909', 'step': 941, 'epoch': 1} {'type': 'loss', 'content': 0.28017252683639526, 'timestamp': '2025-09-05 08:56:33.059718', 'step': 942, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:33.266409', 'step': 942, 'epoch': 1} {'type': 'loss', 'content': 0.32622429728507996, 'timestamp': '2025-09-05 08:56:33.268164', 'step': 943, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:33.435862', 'step': 943, 'epoch': 1} {'type': 'loss', 'content': 0.3794481158256531, 'timestamp': '2025-09-05 08:56:33.452555', 'step': 944, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:33.650667', 'step': 944, 'epoch': 1} {'type': 'loss', 'content': 0.32684630155563354, 'timestamp': '2025-09-05 08:56:33.652374', 'step': 945, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:33.848749', 'step': 945, 'epoch': 1} {'type': 'loss', 'content': 0.34910067915916443, 'timestamp': '2025-09-05 08:56:33.850516', 'step': 946, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:56:34.052773', 'step': 946, 'epoch': 1} {'type': 'loss', 'content': 0.39786896109580994, 'timestamp': '2025-09-05 08:56:34.054439', 'step': 947, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:34.261687', 'step': 947, 'epoch': 1} {'type': 'loss', 'content': 0.410081684589386, 'timestamp': '2025-09-05 08:56:34.276444', 'step': 948, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:34.466684', 'step': 948, 'epoch': 1} {'type': 'loss', 'content': 0.5023239850997925, 'timestamp': '2025-09-05 08:56:34.468684', 'step': 949, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:34.664812', 'step': 949, 'epoch': 1} {'type': 'loss', 'content': 0.29403749108314514, 'timestamp': '2025-09-05 08:56:34.667965', 'step': 950, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:34.863532', 'step': 950, 'epoch': 1} {'type': 'loss', 'content': 0.2468639761209488, 'timestamp': '2025-09-05 08:56:34.865338', 'step': 951, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:35.063835', 'step': 951, 'epoch': 1} {'type': 'loss', 'content': 0.41007065773010254, 'timestamp': '2025-09-05 08:56:35.077372', 'step': 952, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:35.272719', 'step': 952, 'epoch': 1} {'type': 'loss', 'content': 0.26287195086479187, 'timestamp': '2025-09-05 08:56:35.275516', 'step': 953, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:35.442519', 'step': 953, 'epoch': 1} {'type': 'loss', 'content': 0.4006151854991913, 'timestamp': '2025-09-05 08:56:35.444495', 'step': 954, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:35.650340', 'step': 954, 'epoch': 1} {'type': 'loss', 'content': 0.25933578610420227, 'timestamp': '2025-09-05 08:56:35.653205', 'step': 955, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:35.856923', 'step': 955, 'epoch': 1} {'type': 'loss', 'content': 0.2387603372335434, 'timestamp': '2025-09-05 08:56:35.873711', 'step': 956, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:36.072514', 'step': 956, 'epoch': 1} {'type': 'loss', 'content': 0.3455435633659363, 'timestamp': '2025-09-05 08:56:36.074229', 'step': 957, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:36.270676', 'step': 957, 'epoch': 1} {'type': 'loss', 'content': 0.42304477095603943, 'timestamp': '2025-09-05 08:56:36.272861', 'step': 958, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:36.470360', 'step': 958, 'epoch': 1} {'type': 'loss', 'content': 0.5853138566017151, 'timestamp': '2025-09-05 08:56:36.472351', 'step': 959, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:36.678570', 'step': 959, 'epoch': 1} {'type': 'loss', 'content': 0.4282821714878082, 'timestamp': '2025-09-05 08:56:36.696107', 'step': 960, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:56:41.365028', 'step': 960, 'epoch': 1} {'type': 'pplx', 'content': 58.0195968066891, 'timestamp': '2025-09-05 08:56:41.367535', 'step': 960, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 960', 'timestamp': '2025-09-05 08:56:41.827347', 'step': 960, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:41.991345', 'step': 960, 'epoch': 1} {'type': 'loss', 'content': 0.38176435232162476, 'timestamp': '2025-09-05 08:56:41.992895', 'step': 961, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:42.188364', 'step': 961, 'epoch': 1} {'type': 'loss', 'content': 0.3238293528556824, 'timestamp': '2025-09-05 08:56:42.190547', 'step': 962, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:42.387838', 'step': 962, 'epoch': 1} {'type': 'loss', 'content': 0.37185245752334595, 'timestamp': '2025-09-05 08:56:42.389442', 'step': 963, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:42.587007', 'step': 963, 'epoch': 1} {'type': 'loss', 'content': 0.33915528655052185, 'timestamp': '2025-09-05 08:56:42.601403', 'step': 964, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:42.789914', 'step': 964, 'epoch': 1} {'type': 'loss', 'content': 0.4303402900695801, 'timestamp': '2025-09-05 08:56:42.792185', 'step': 965, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:42.989673', 'step': 965, 'epoch': 1} {'type': 'loss', 'content': 0.41477546095848083, 'timestamp': '2025-09-05 08:56:42.991198', 'step': 966, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:43.201206', 'step': 966, 'epoch': 1} {'type': 'loss', 'content': 0.3982813060283661, 'timestamp': '2025-09-05 08:56:43.202897', 'step': 967, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:43.399469', 'step': 967, 'epoch': 1} {'type': 'loss', 'content': 0.2835683822631836, 'timestamp': '2025-09-05 08:56:43.413180', 'step': 968, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:43.601190', 'step': 968, 'epoch': 1} {'type': 'loss', 'content': 0.39541733264923096, 'timestamp': '2025-09-05 08:56:43.606800', 'step': 969, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:43.812853', 'step': 969, 'epoch': 1} {'type': 'loss', 'content': 0.37673527002334595, 'timestamp': '2025-09-05 08:56:43.814961', 'step': 970, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:44.012986', 'step': 970, 'epoch': 1} {'type': 'loss', 'content': 0.43757012486457825, 'timestamp': '2025-09-05 08:56:44.014992', 'step': 971, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:44.219053', 'step': 971, 'epoch': 1} {'type': 'loss', 'content': 0.34792765974998474, 'timestamp': '2025-09-05 08:56:44.236654', 'step': 972, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:44.435682', 'step': 972, 'epoch': 1} {'type': 'loss', 'content': 0.2519514858722687, 'timestamp': '2025-09-05 08:56:44.437511', 'step': 973, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:44.644146', 'step': 973, 'epoch': 1} {'type': 'loss', 'content': 0.37428492307662964, 'timestamp': '2025-09-05 08:56:44.645999', 'step': 974, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:44.842027', 'step': 974, 'epoch': 1} {'type': 'loss', 'content': 0.4176678955554962, 'timestamp': '2025-09-05 08:56:44.843935', 'step': 975, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:45.041995', 'step': 975, 'epoch': 1} {'type': 'loss', 'content': 0.32866379618644714, 'timestamp': '2025-09-05 08:56:45.056419', 'step': 976, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:45.253585', 'step': 976, 'epoch': 1} {'type': 'loss', 'content': 0.3537275493144989, 'timestamp': '2025-09-05 08:56:45.255728', 'step': 977, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:45.462343', 'step': 977, 'epoch': 1} {'type': 'loss', 'content': 0.39139440655708313, 'timestamp': '2025-09-05 08:56:45.464065', 'step': 978, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:45.660402', 'step': 978, 'epoch': 1} {'type': 'loss', 'content': 0.47776591777801514, 'timestamp': '2025-09-05 08:56:45.662496', 'step': 979, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:45.870532', 'step': 979, 'epoch': 1} {'type': 'loss', 'content': 0.39145949482917786, 'timestamp': '2025-09-05 08:56:45.884767', 'step': 980, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:56:50.525816', 'step': 980, 'epoch': 1} {'type': 'pplx', 'content': 57.20121236465665, 'timestamp': '2025-09-05 08:56:50.527616', 'step': 980, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:50.691990', 'step': 980, 'epoch': 1} {'type': 'loss', 'content': 0.34527283906936646, 'timestamp': '2025-09-05 08:56:50.694050', 'step': 981, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:50.861530', 'step': 981, 'epoch': 1} {'type': 'loss', 'content': 0.43041056394577026, 'timestamp': '2025-09-05 08:56:50.863361', 'step': 982, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:51.068897', 'step': 982, 'epoch': 1} {'type': 'loss', 'content': 0.32550325989723206, 'timestamp': '2025-09-05 08:56:51.070801', 'step': 983, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:51.276949', 'step': 983, 'epoch': 1} {'type': 'loss', 'content': 0.3620685040950775, 'timestamp': '2025-09-05 08:56:51.291433', 'step': 984, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:51.479999', 'step': 984, 'epoch': 1} {'type': 'loss', 'content': 0.36788034439086914, 'timestamp': '2025-09-05 08:56:51.482550', 'step': 985, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:51.678037', 'step': 985, 'epoch': 1} {'type': 'loss', 'content': 0.20247577130794525, 'timestamp': '2025-09-05 08:56:51.680666', 'step': 986, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:51.887299', 'step': 986, 'epoch': 1} {'type': 'loss', 'content': 0.255616694688797, 'timestamp': '2025-09-05 08:56:51.889216', 'step': 987, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:52.085171', 'step': 987, 'epoch': 1} {'type': 'loss', 'content': 0.32949069142341614, 'timestamp': '2025-09-05 08:56:52.099490', 'step': 988, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:52.298089', 'step': 988, 'epoch': 1} {'type': 'loss', 'content': 0.2952141761779785, 'timestamp': '2025-09-05 08:56:52.300055', 'step': 989, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:56:52.495645', 'step': 989, 'epoch': 1} {'type': 'loss', 'content': 0.3354770541191101, 'timestamp': '2025-09-05 08:56:52.497604', 'step': 990, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:52.696906', 'step': 990, 'epoch': 1} {'type': 'loss', 'content': 0.4000490605831146, 'timestamp': '2025-09-05 08:56:52.699540', 'step': 991, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:52.895826', 'step': 991, 'epoch': 1} {'type': 'loss', 'content': 0.400611937046051, 'timestamp': '2025-09-05 08:56:52.905288', 'step': 992, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:53.068126', 'step': 992, 'epoch': 1} {'type': 'loss', 'content': 0.41154736280441284, 'timestamp': '2025-09-05 08:56:53.069916', 'step': 993, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:53.289245', 'step': 993, 'epoch': 1} {'type': 'loss', 'content': 0.3314960300922394, 'timestamp': '2025-09-05 08:56:53.291054', 'step': 994, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:53.499477', 'step': 994, 'epoch': 1} {'type': 'loss', 'content': 0.3384896218776703, 'timestamp': '2025-09-05 08:56:53.501707', 'step': 995, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:56:53.708716', 'step': 995, 'epoch': 1} {'type': 'loss', 'content': 0.24565917253494263, 'timestamp': '2025-09-05 08:56:53.723148', 'step': 996, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:53.914157', 'step': 996, 'epoch': 1} {'type': 'loss', 'content': 0.30330580472946167, 'timestamp': '2025-09-05 08:56:53.915892', 'step': 997, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:56:54.113693', 'step': 997, 'epoch': 1} {'type': 'loss', 'content': 0.339424192905426, 'timestamp': '2025-09-05 08:56:54.116077', 'step': 998, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:54.321917', 'step': 998, 'epoch': 1} {'type': 'loss', 'content': 0.3887946605682373, 'timestamp': '2025-09-05 08:56:54.323779', 'step': 999, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:56:54.523115', 'step': 999, 'epoch': 1} {'type': 'loss', 'content': 0.3614341616630554, 'timestamp': '2025-09-05 08:56:54.537460', 'step': 1000, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:56:59.200485', 'step': 1000, 'epoch': 1} {'type': 'pplx', 'content': 56.91533872503707, 'timestamp': '2025-09-05 08:56:59.202606', 'step': 1000, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-05 08:56:59.653697', 'step': 1000, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:56:59.817222', 'step': 1000, 'epoch': 1} {'type': 'loss', 'content': 0.39639365673065186, 'timestamp': '2025-09-05 08:56:59.819075', 'step': 1001, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:56:59.988417', 'step': 1001, 'epoch': 1} {'type': 'loss', 'content': 0.34922510385513306, 'timestamp': '2025-09-05 08:56:59.990486', 'step': 1002, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:00.196565', 'step': 1002, 'epoch': 1} {'type': 'loss', 'content': 0.3727569878101349, 'timestamp': '2025-09-05 08:57:00.198531', 'step': 1003, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:00.366728', 'step': 1003, 'epoch': 1} {'type': 'loss', 'content': 0.4180833697319031, 'timestamp': '2025-09-05 08:57:00.381916', 'step': 1004, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:00.579962', 'step': 1004, 'epoch': 1} {'type': 'loss', 'content': 0.3678615689277649, 'timestamp': '2025-09-05 08:57:00.582147', 'step': 1005, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:00.782046', 'step': 1005, 'epoch': 1} {'type': 'loss', 'content': 0.4152246415615082, 'timestamp': '2025-09-05 08:57:00.783965', 'step': 1006, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:00.983251', 'step': 1006, 'epoch': 1} {'type': 'loss', 'content': 0.25905001163482666, 'timestamp': '2025-09-05 08:57:00.985125', 'step': 1007, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:01.153313', 'step': 1007, 'epoch': 1} {'type': 'loss', 'content': 0.32605719566345215, 'timestamp': '2025-09-05 08:57:01.168446', 'step': 1008, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:01.357148', 'step': 1008, 'epoch': 1} {'type': 'loss', 'content': 0.23667758703231812, 'timestamp': '2025-09-05 08:57:01.359310', 'step': 1009, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:01.556187', 'step': 1009, 'epoch': 1} {'type': 'loss', 'content': 0.32055971026420593, 'timestamp': '2025-09-05 08:57:01.558142', 'step': 1010, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:01.754795', 'step': 1010, 'epoch': 1} {'type': 'loss', 'content': 0.3258388340473175, 'timestamp': '2025-09-05 08:57:01.757186', 'step': 1011, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:01.957019', 'step': 1011, 'epoch': 1} {'type': 'loss', 'content': 0.3455141484737396, 'timestamp': '2025-09-05 08:57:01.971080', 'step': 1012, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:02.159089', 'step': 1012, 'epoch': 1} {'type': 'loss', 'content': 0.33423060178756714, 'timestamp': '2025-09-05 08:57:02.161397', 'step': 1013, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:02.360107', 'step': 1013, 'epoch': 1} {'type': 'loss', 'content': 0.2707538902759552, 'timestamp': '2025-09-05 08:57:02.364072', 'step': 1014, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:02.568044', 'step': 1014, 'epoch': 1} {'type': 'loss', 'content': 0.3885761499404907, 'timestamp': '2025-09-05 08:57:02.570516', 'step': 1015, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:02.768993', 'step': 1015, 'epoch': 1} {'type': 'loss', 'content': 0.26470303535461426, 'timestamp': '2025-09-05 08:57:02.783878', 'step': 1016, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:02.975347', 'step': 1016, 'epoch': 1} {'type': 'loss', 'content': 0.3257010877132416, 'timestamp': '2025-09-05 08:57:02.977993', 'step': 1017, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:03.175106', 'step': 1017, 'epoch': 1} {'type': 'loss', 'content': 0.34169089794158936, 'timestamp': '2025-09-05 08:57:03.176698', 'step': 1018, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:03.375475', 'step': 1018, 'epoch': 1} {'type': 'loss', 'content': 0.3442467749118805, 'timestamp': '2025-09-05 08:57:03.377243', 'step': 1019, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:03.575558', 'step': 1019, 'epoch': 1} {'type': 'loss', 'content': 0.3116152286529541, 'timestamp': '2025-09-05 08:57:03.592015', 'step': 1020, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:57:08.258421', 'step': 1020, 'epoch': 1} {'type': 'pplx', 'content': 56.72869426129927, 'timestamp': '2025-09-05 08:57:08.260263', 'step': 1020, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:08.421747', 'step': 1020, 'epoch': 1} {'type': 'loss', 'content': 0.40635350346565247, 'timestamp': '2025-09-05 08:57:08.424564', 'step': 1021, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:08.591844', 'step': 1021, 'epoch': 1} {'type': 'loss', 'content': 0.5129305720329285, 'timestamp': '2025-09-05 08:57:08.593715', 'step': 1022, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:08.799002', 'step': 1022, 'epoch': 1} {'type': 'loss', 'content': 0.3052510917186737, 'timestamp': '2025-09-05 08:57:08.800799', 'step': 1023, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:08.997545', 'step': 1023, 'epoch': 1} {'type': 'loss', 'content': 0.29209795594215393, 'timestamp': '2025-09-05 08:57:09.011812', 'step': 1024, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:09.200301', 'step': 1024, 'epoch': 1} {'type': 'loss', 'content': 0.4792327284812927, 'timestamp': '2025-09-05 08:57:09.202863', 'step': 1025, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:09.370832', 'step': 1025, 'epoch': 1} {'type': 'loss', 'content': 0.2132500410079956, 'timestamp': '2025-09-05 08:57:09.373056', 'step': 1026, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:09.569420', 'step': 1026, 'epoch': 1} {'type': 'loss', 'content': 0.4008413553237915, 'timestamp': '2025-09-05 08:57:09.571230', 'step': 1027, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:09.775965', 'step': 1027, 'epoch': 1} {'type': 'loss', 'content': 0.40069666504859924, 'timestamp': '2025-09-05 08:57:09.785979', 'step': 1028, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:09.951269', 'step': 1028, 'epoch': 1} {'type': 'loss', 'content': 0.4267215132713318, 'timestamp': '2025-09-05 08:57:09.953022', 'step': 1029, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:10.158013', 'step': 1029, 'epoch': 1} {'type': 'loss', 'content': 0.38750991225242615, 'timestamp': '2025-09-05 08:57:10.159677', 'step': 1030, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:10.352728', 'step': 1030, 'epoch': 1} {'type': 'loss', 'content': 0.26181745529174805, 'timestamp': '2025-09-05 08:57:10.354524', 'step': 1031, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:10.560112', 'step': 1031, 'epoch': 1} {'type': 'loss', 'content': 0.27084633708000183, 'timestamp': '2025-09-05 08:57:10.576594', 'step': 1032, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:10.772045', 'step': 1032, 'epoch': 1} {'type': 'loss', 'content': 0.29004645347595215, 'timestamp': '2025-09-05 08:57:10.774068', 'step': 1033, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:10.940422', 'step': 1033, 'epoch': 1} {'type': 'loss', 'content': 0.4643933176994324, 'timestamp': '2025-09-05 08:57:10.942597', 'step': 1034, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:57:11.150075', 'step': 1034, 'epoch': 1} {'type': 'loss', 'content': 0.42606353759765625, 'timestamp': '2025-09-05 08:57:11.151790', 'step': 1035, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:11.358130', 'step': 1035, 'epoch': 1} {'type': 'loss', 'content': 0.5070635676383972, 'timestamp': '2025-09-05 08:57:11.375711', 'step': 1036, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:11.572391', 'step': 1036, 'epoch': 1} {'type': 'loss', 'content': 0.3080455958843231, 'timestamp': '2025-09-05 08:57:11.574129', 'step': 1037, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:11.778556', 'step': 1037, 'epoch': 1} {'type': 'loss', 'content': 0.25634804368019104, 'timestamp': '2025-09-05 08:57:11.780617', 'step': 1038, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:57:11.987608', 'step': 1038, 'epoch': 1} {'type': 'loss', 'content': 0.3523538410663605, 'timestamp': '2025-09-05 08:57:11.989435', 'step': 1039, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:12.186193', 'step': 1039, 'epoch': 1} {'type': 'loss', 'content': 0.353440523147583, 'timestamp': '2025-09-05 08:57:12.195768', 'step': 1040, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:57:16.862221', 'step': 1040, 'epoch': 1} {'type': 'pplx', 'content': 56.865375266262824, 'timestamp': '2025-09-05 08:57:16.865926', 'step': 1040, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1040', 'timestamp': '2025-09-05 08:57:17.336759', 'step': 1040, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:17.507192', 'step': 1040, 'epoch': 1} {'type': 'loss', 'content': 0.4089427888393402, 'timestamp': '2025-09-05 08:57:17.510045', 'step': 1041, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:17.715791', 'step': 1041, 'epoch': 1} {'type': 'loss', 'content': 0.35010460019111633, 'timestamp': '2025-09-05 08:57:17.718881', 'step': 1042, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:17.914889', 'step': 1042, 'epoch': 1} {'type': 'loss', 'content': 0.2957332134246826, 'timestamp': '2025-09-05 08:57:17.917349', 'step': 1043, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:18.114054', 'step': 1043, 'epoch': 1} {'type': 'loss', 'content': 0.25546780228614807, 'timestamp': '2025-09-05 08:57:18.128508', 'step': 1044, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:18.329998', 'step': 1044, 'epoch': 1} {'type': 'loss', 'content': 0.29871493577957153, 'timestamp': '2025-09-05 08:57:18.332151', 'step': 1045, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:18.501032', 'step': 1045, 'epoch': 1} {'type': 'loss', 'content': 0.3960125744342804, 'timestamp': '2025-09-05 08:57:18.503962', 'step': 1046, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:18.711095', 'step': 1046, 'epoch': 1} {'type': 'loss', 'content': 0.2644537091255188, 'timestamp': '2025-09-05 08:57:18.713113', 'step': 1047, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:18.910983', 'step': 1047, 'epoch': 1} {'type': 'loss', 'content': 0.30242469906806946, 'timestamp': '2025-09-05 08:57:18.920490', 'step': 1048, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:19.084703', 'step': 1048, 'epoch': 1} {'type': 'loss', 'content': 0.43861666321754456, 'timestamp': '2025-09-05 08:57:19.087028', 'step': 1049, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:19.256016', 'step': 1049, 'epoch': 1} {'type': 'loss', 'content': 0.26393255591392517, 'timestamp': '2025-09-05 08:57:19.258205', 'step': 1050, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:19.464084', 'step': 1050, 'epoch': 1} {'type': 'loss', 'content': 0.29509156942367554, 'timestamp': '2025-09-05 08:57:19.466567', 'step': 1051, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:19.637608', 'step': 1051, 'epoch': 1} {'type': 'loss', 'content': 0.26441511511802673, 'timestamp': '2025-09-05 08:57:19.654587', 'step': 1052, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:19.852491', 'step': 1052, 'epoch': 1} {'type': 'loss', 'content': 0.3607224225997925, 'timestamp': '2025-09-05 08:57:19.854411', 'step': 1053, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:20.026434', 'step': 1053, 'epoch': 1} {'type': 'loss', 'content': 0.41068440675735474, 'timestamp': '2025-09-05 08:57:20.028325', 'step': 1054, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:20.238848', 'step': 1054, 'epoch': 1} {'type': 'loss', 'content': 0.3460855484008789, 'timestamp': '2025-09-05 08:57:20.241070', 'step': 1055, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:20.438914', 'step': 1055, 'epoch': 1} {'type': 'loss', 'content': 0.22883664071559906, 'timestamp': '2025-09-05 08:57:20.455318', 'step': 1056, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:20.652501', 'step': 1056, 'epoch': 1} {'type': 'loss', 'content': 0.3002649247646332, 'timestamp': '2025-09-05 08:57:20.654137', 'step': 1057, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:20.859648', 'step': 1057, 'epoch': 1} {'type': 'loss', 'content': 0.3611856698989868, 'timestamp': '2025-09-05 08:57:20.862321', 'step': 1058, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:21.030886', 'step': 1058, 'epoch': 1} {'type': 'loss', 'content': 0.34930041432380676, 'timestamp': '2025-09-05 08:57:21.033038', 'step': 1059, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:21.241166', 'step': 1059, 'epoch': 1} {'type': 'loss', 'content': 0.2977052628993988, 'timestamp': '2025-09-05 08:57:21.255758', 'step': 1060, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:57:25.941683', 'step': 1060, 'epoch': 1} {'type': 'pplx', 'content': 58.08633734713871, 'timestamp': '2025-09-05 08:57:25.943919', 'step': 1060, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:57:26.106655', 'step': 1060, 'epoch': 1} {'type': 'loss', 'content': 0.3529953360557556, 'timestamp': '2025-09-05 08:57:26.109260', 'step': 1061, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:26.276963', 'step': 1061, 'epoch': 1} {'type': 'loss', 'content': 0.25426262617111206, 'timestamp': '2025-09-05 08:57:26.278757', 'step': 1062, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:26.484665', 'step': 1062, 'epoch': 1} {'type': 'loss', 'content': 0.39096131920814514, 'timestamp': '2025-09-05 08:57:26.486460', 'step': 1063, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:26.682998', 'step': 1063, 'epoch': 1} {'type': 'loss', 'content': 0.39098745584487915, 'timestamp': '2025-09-05 08:57:26.697339', 'step': 1064, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:26.885072', 'step': 1064, 'epoch': 1} {'type': 'loss', 'content': 0.39723387360572815, 'timestamp': '2025-09-05 08:57:26.887006', 'step': 1065, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:27.083172', 'step': 1065, 'epoch': 1} {'type': 'loss', 'content': 0.3086792528629303, 'timestamp': '2025-09-05 08:57:27.085530', 'step': 1066, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:27.283705', 'step': 1066, 'epoch': 1} {'type': 'loss', 'content': 0.3772234320640564, 'timestamp': '2025-09-05 08:57:27.285628', 'step': 1067, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:27.483230', 'step': 1067, 'epoch': 1} {'type': 'loss', 'content': 0.44487428665161133, 'timestamp': '2025-09-05 08:57:27.497543', 'step': 1068, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:57:27.686727', 'step': 1068, 'epoch': 1} {'type': 'loss', 'content': 0.2712862193584442, 'timestamp': '2025-09-05 08:57:27.688749', 'step': 1069, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:27.899909', 'step': 1069, 'epoch': 1} {'type': 'loss', 'content': 0.2736116647720337, 'timestamp': '2025-09-05 08:57:27.902023', 'step': 1070, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:28.104397', 'step': 1070, 'epoch': 1} {'type': 'loss', 'content': 0.2640036344528198, 'timestamp': '2025-09-05 08:57:28.106341', 'step': 1071, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:57:28.301834', 'step': 1071, 'epoch': 1} {'type': 'loss', 'content': 0.25708210468292236, 'timestamp': '2025-09-05 08:57:28.316390', 'step': 1072, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:28.502823', 'step': 1072, 'epoch': 1} {'type': 'loss', 'content': 0.4192257523536682, 'timestamp': '2025-09-05 08:57:28.504604', 'step': 1073, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:28.702247', 'step': 1073, 'epoch': 1} {'type': 'loss', 'content': 0.5188899040222168, 'timestamp': '2025-09-05 08:57:28.704344', 'step': 1074, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:28.910313', 'step': 1074, 'epoch': 1} {'type': 'loss', 'content': 0.47707605361938477, 'timestamp': '2025-09-05 08:57:28.912539', 'step': 1075, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:29.123376', 'step': 1075, 'epoch': 1} {'type': 'loss', 'content': 0.2549794614315033, 'timestamp': '2025-09-05 08:57:29.140734', 'step': 1076, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:29.340155', 'step': 1076, 'epoch': 1} {'type': 'loss', 'content': 0.32116273045539856, 'timestamp': '2025-09-05 08:57:29.342027', 'step': 1077, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:29.537300', 'step': 1077, 'epoch': 1} {'type': 'loss', 'content': 0.2507237195968628, 'timestamp': '2025-09-05 08:57:29.539351', 'step': 1078, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:29.744422', 'step': 1078, 'epoch': 1} {'type': 'loss', 'content': 0.30762243270874023, 'timestamp': '2025-09-05 08:57:29.746343', 'step': 1079, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:29.943339', 'step': 1079, 'epoch': 1} {'type': 'loss', 'content': 0.3563655614852905, 'timestamp': '2025-09-05 08:57:29.960859', 'step': 1080, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:57:34.630014', 'step': 1080, 'epoch': 1} {'type': 'pplx', 'content': 57.13828478413463, 'timestamp': '2025-09-05 08:57:34.632225', 'step': 1080, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1080', 'timestamp': '2025-09-05 08:57:35.114220', 'step': 1080, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:35.297356', 'step': 1080, 'epoch': 1} {'type': 'loss', 'content': 0.3543899655342102, 'timestamp': '2025-09-05 08:57:35.299618', 'step': 1081, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:35.505794', 'step': 1081, 'epoch': 1} {'type': 'loss', 'content': 0.37011322379112244, 'timestamp': '2025-09-05 08:57:35.507707', 'step': 1082, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:35.705175', 'step': 1082, 'epoch': 1} {'type': 'loss', 'content': 0.29471492767333984, 'timestamp': '2025-09-05 08:57:35.706975', 'step': 1083, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:35.903105', 'step': 1083, 'epoch': 1} {'type': 'loss', 'content': 0.40031829476356506, 'timestamp': '2025-09-05 08:57:35.917669', 'step': 1084, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:36.107208', 'step': 1084, 'epoch': 1} {'type': 'loss', 'content': 0.25579631328582764, 'timestamp': '2025-09-05 08:57:36.109233', 'step': 1085, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:36.314564', 'step': 1085, 'epoch': 1} {'type': 'loss', 'content': 0.30125442147254944, 'timestamp': '2025-09-05 08:57:36.316525', 'step': 1086, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:36.522862', 'step': 1086, 'epoch': 1} {'type': 'loss', 'content': 0.3710455000400543, 'timestamp': '2025-09-05 08:57:36.525296', 'step': 1087, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:36.722872', 'step': 1087, 'epoch': 1} {'type': 'loss', 'content': 0.38871708512306213, 'timestamp': '2025-09-05 08:57:36.736903', 'step': 1088, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:36.925159', 'step': 1088, 'epoch': 1} {'type': 'loss', 'content': 0.4635475277900696, 'timestamp': '2025-09-05 08:57:36.926826', 'step': 1089, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:57:37.121132', 'step': 1089, 'epoch': 1} {'type': 'loss', 'content': 0.3786855936050415, 'timestamp': '2025-09-05 08:57:37.123066', 'step': 1090, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:37.320185', 'step': 1090, 'epoch': 1} {'type': 'loss', 'content': 0.29588383436203003, 'timestamp': '2025-09-05 08:57:37.321973', 'step': 1091, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:37.516900', 'step': 1091, 'epoch': 1} {'type': 'loss', 'content': 0.2814580500125885, 'timestamp': '2025-09-05 08:57:37.531031', 'step': 1092, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:37.719213', 'step': 1092, 'epoch': 1} {'type': 'loss', 'content': 0.29636815190315247, 'timestamp': '2025-09-05 08:57:37.721048', 'step': 1093, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:57:37.888769', 'step': 1093, 'epoch': 1} {'type': 'loss', 'content': 0.30050742626190186, 'timestamp': '2025-09-05 08:57:37.890910', 'step': 1094, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:38.095014', 'step': 1094, 'epoch': 1} {'type': 'loss', 'content': 0.21762610971927643, 'timestamp': '2025-09-05 08:57:38.096704', 'step': 1095, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:38.293599', 'step': 1095, 'epoch': 1} {'type': 'loss', 'content': 0.36448773741722107, 'timestamp': '2025-09-05 08:57:38.308108', 'step': 1096, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:38.495244', 'step': 1096, 'epoch': 1} {'type': 'loss', 'content': 0.2675822079181671, 'timestamp': '2025-09-05 08:57:38.497086', 'step': 1097, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:38.703191', 'step': 1097, 'epoch': 1} {'type': 'loss', 'content': 0.3438616693019867, 'timestamp': '2025-09-05 08:57:38.705356', 'step': 1098, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:38.901089', 'step': 1098, 'epoch': 1} {'type': 'loss', 'content': 0.25544899702072144, 'timestamp': '2025-09-05 08:57:38.903018', 'step': 1099, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:39.109292', 'step': 1099, 'epoch': 1} {'type': 'loss', 'content': 0.15850801765918732, 'timestamp': '2025-09-05 08:57:39.123875', 'step': 1100, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:57:43.756122', 'step': 1100, 'epoch': 1} {'type': 'pplx', 'content': 55.68282147255737, 'timestamp': '2025-09-05 08:57:43.758533', 'step': 1100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:43.921152', 'step': 1100, 'epoch': 1} {'type': 'loss', 'content': 0.4078519344329834, 'timestamp': '2025-09-05 08:57:43.923717', 'step': 1101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:44.130660', 'step': 1101, 'epoch': 1} {'type': 'loss', 'content': 0.31081441044807434, 'timestamp': '2025-09-05 08:57:44.132606', 'step': 1102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:44.340156', 'step': 1102, 'epoch': 1} {'type': 'loss', 'content': 0.14278367161750793, 'timestamp': '2025-09-05 08:57:44.342528', 'step': 1103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:44.549416', 'step': 1103, 'epoch': 1} {'type': 'loss', 'content': 0.2088335156440735, 'timestamp': '2025-09-05 08:57:44.563823', 'step': 1104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:44.760932', 'step': 1104, 'epoch': 1} {'type': 'loss', 'content': 0.3414754569530487, 'timestamp': '2025-09-05 08:57:44.762833', 'step': 1105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:44.960267', 'step': 1105, 'epoch': 1} {'type': 'loss', 'content': 0.43541449308395386, 'timestamp': '2025-09-05 08:57:44.962499', 'step': 1106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:45.170538', 'step': 1106, 'epoch': 1} {'type': 'loss', 'content': 0.38098201155662537, 'timestamp': '2025-09-05 08:57:45.173030', 'step': 1107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 5440033091648.0}, 'timestamp': '2025-09-05 08:57:45.374443', 'step': 1107, 'epoch': 1} {'type': 'loss', 'content': 0.49568867683410645, 'timestamp': '2025-09-05 08:57:45.391316', 'step': 1108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:45.590356', 'step': 1108, 'epoch': 1} {'type': 'loss', 'content': 0.20845304429531097, 'timestamp': '2025-09-05 08:57:45.592498', 'step': 1109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:45.789652', 'step': 1109, 'epoch': 1} {'type': 'loss', 'content': 0.34582969546318054, 'timestamp': '2025-09-05 08:57:45.791657', 'step': 1110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:45.995088', 'step': 1110, 'epoch': 1} {'type': 'loss', 'content': 0.30466410517692566, 'timestamp': '2025-09-05 08:57:45.997298', 'step': 1111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:46.192699', 'step': 1111, 'epoch': 1} {'type': 'loss', 'content': 0.3194935917854309, 'timestamp': '2025-09-05 08:57:46.209949', 'step': 1112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:46.408493', 'step': 1112, 'epoch': 1} {'type': 'loss', 'content': 0.3247276246547699, 'timestamp': '2025-09-05 08:57:46.410457', 'step': 1113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:46.617098', 'step': 1113, 'epoch': 1} {'type': 'loss', 'content': 0.26578959822654724, 'timestamp': '2025-09-05 08:57:46.618843', 'step': 1114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:46.818714', 'step': 1114, 'epoch': 1} {'type': 'loss', 'content': 0.48078733682632446, 'timestamp': '2025-09-05 08:57:46.820884', 'step': 1115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:47.016092', 'step': 1115, 'epoch': 1} {'type': 'loss', 'content': 0.3435327410697937, 'timestamp': '2025-09-05 08:57:47.030516', 'step': 1116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:57:47.216523', 'step': 1116, 'epoch': 1} {'type': 'loss', 'content': 0.3160141408443451, 'timestamp': '2025-09-05 08:57:47.218401', 'step': 1117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:47.412594', 'step': 1117, 'epoch': 1} {'type': 'loss', 'content': 0.3424045145511627, 'timestamp': '2025-09-05 08:57:47.414440', 'step': 1118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:47.610879', 'step': 1118, 'epoch': 1} {'type': 'loss', 'content': 0.29838377237319946, 'timestamp': '2025-09-05 08:57:47.612751', 'step': 1119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:47.808963', 'step': 1119, 'epoch': 1} {'type': 'loss', 'content': 0.2777602970600128, 'timestamp': '2025-09-05 08:57:47.823432', 'step': 1120, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:57:52.452094', 'step': 1120, 'epoch': 1} {'type': 'pplx', 'content': 56.402618878722485, 'timestamp': '2025-09-05 08:57:52.454888', 'step': 1120, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1120', 'timestamp': '2025-09-05 08:57:52.924333', 'step': 1120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:57:53.084960', 'step': 1120, 'epoch': 1} {'type': 'loss', 'content': 0.2436535656452179, 'timestamp': '2025-09-05 08:57:53.086827', 'step': 1121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:53.289641', 'step': 1121, 'epoch': 1} {'type': 'loss', 'content': 0.3066195845603943, 'timestamp': '2025-09-05 08:57:53.291300', 'step': 1122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:53.499294', 'step': 1122, 'epoch': 1} {'type': 'loss', 'content': 0.210659921169281, 'timestamp': '2025-09-05 08:57:53.501932', 'step': 1123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:53.701662', 'step': 1123, 'epoch': 1} {'type': 'loss', 'content': 0.42288801074028015, 'timestamp': '2025-09-05 08:57:53.718355', 'step': 1124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:53.917047', 'step': 1124, 'epoch': 1} {'type': 'loss', 'content': 0.27715227007865906, 'timestamp': '2025-09-05 08:57:53.918967', 'step': 1125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:54.085170', 'step': 1125, 'epoch': 1} {'type': 'loss', 'content': 0.4282056987285614, 'timestamp': '2025-09-05 08:57:54.087062', 'step': 1126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:57:54.280878', 'step': 1126, 'epoch': 1} {'type': 'loss', 'content': 0.3095110356807709, 'timestamp': '2025-09-05 08:57:54.283528', 'step': 1127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:54.489385', 'step': 1127, 'epoch': 1} {'type': 'loss', 'content': 0.3560994267463684, 'timestamp': '2025-09-05 08:57:54.504279', 'step': 1128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:54.706093', 'step': 1128, 'epoch': 1} {'type': 'loss', 'content': 0.5486235618591309, 'timestamp': '2025-09-05 08:57:54.708108', 'step': 1129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:54.915209', 'step': 1129, 'epoch': 1} {'type': 'loss', 'content': 0.2991634011268616, 'timestamp': '2025-09-05 08:57:54.917880', 'step': 1130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:55.116971', 'step': 1130, 'epoch': 1} {'type': 'loss', 'content': 0.323395311832428, 'timestamp': '2025-09-05 08:57:55.119750', 'step': 1131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:57:55.315888', 'step': 1131, 'epoch': 1} {'type': 'loss', 'content': 0.2886749505996704, 'timestamp': '2025-09-05 08:57:55.330392', 'step': 1132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:55.529682', 'step': 1132, 'epoch': 1} {'type': 'loss', 'content': 0.3267287313938141, 'timestamp': '2025-09-05 08:57:55.531485', 'step': 1133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:55.728151', 'step': 1133, 'epoch': 1} {'type': 'loss', 'content': 0.38941022753715515, 'timestamp': '2025-09-05 08:57:55.730008', 'step': 1134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:57:55.924848', 'step': 1134, 'epoch': 1} {'type': 'loss', 'content': 0.3658711016178131, 'timestamp': '2025-09-05 08:57:55.926503', 'step': 1135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:56.093798', 'step': 1135, 'epoch': 1} {'type': 'loss', 'content': 0.34820908308029175, 'timestamp': '2025-09-05 08:57:56.110544', 'step': 1136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:56.308140', 'step': 1136, 'epoch': 1} {'type': 'loss', 'content': 0.3882652819156647, 'timestamp': '2025-09-05 08:57:56.310266', 'step': 1137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:57:56.479788', 'step': 1137, 'epoch': 1} {'type': 'loss', 'content': 0.17920145392417908, 'timestamp': '2025-09-05 08:57:56.482460', 'step': 1138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:57:56.689791', 'step': 1138, 'epoch': 1} {'type': 'loss', 'content': 0.21023604273796082, 'timestamp': '2025-09-05 08:57:56.692566', 'step': 1139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:57:56.889514', 'step': 1139, 'epoch': 1} {'type': 'loss', 'content': 0.28918132185935974, 'timestamp': '2025-09-05 08:57:56.904085', 'step': 1140, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:58:01.541928', 'step': 1140, 'epoch': 1} {'type': 'pplx', 'content': 56.62639246330814, 'timestamp': '2025-09-05 08:58:01.543982', 'step': 1140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:01.704731', 'step': 1140, 'epoch': 1} {'type': 'loss', 'content': 0.2688441276550293, 'timestamp': '2025-09-05 08:58:01.707270', 'step': 1141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:01.918528', 'step': 1141, 'epoch': 1} {'type': 'loss', 'content': 0.14376114308834076, 'timestamp': '2025-09-05 08:58:01.923187', 'step': 1142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:02.134322', 'step': 1142, 'epoch': 1} {'type': 'loss', 'content': 0.375431627035141, 'timestamp': '2025-09-05 08:58:02.136938', 'step': 1143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:02.351335', 'step': 1143, 'epoch': 1} {'type': 'loss', 'content': 0.3484734892845154, 'timestamp': '2025-09-05 08:58:02.371162', 'step': 1144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:02.572337', 'step': 1144, 'epoch': 1} {'type': 'loss', 'content': 0.3586747646331787, 'timestamp': '2025-09-05 08:58:02.573999', 'step': 1145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:02.768214', 'step': 1145, 'epoch': 1} {'type': 'loss', 'content': 0.339052677154541, 'timestamp': '2025-09-05 08:58:02.770566', 'step': 1146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:02.966732', 'step': 1146, 'epoch': 1} {'type': 'loss', 'content': 0.38937392830848694, 'timestamp': '2025-09-05 08:58:02.969114', 'step': 1147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:58:03.175468', 'step': 1147, 'epoch': 1} {'type': 'loss', 'content': 0.5443062782287598, 'timestamp': '2025-09-05 08:58:03.191793', 'step': 1148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:03.390206', 'step': 1148, 'epoch': 1} {'type': 'loss', 'content': 0.2671515643596649, 'timestamp': '2025-09-05 08:58:03.392071', 'step': 1149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:03.588804', 'step': 1149, 'epoch': 1} {'type': 'loss', 'content': 0.31323206424713135, 'timestamp': '2025-09-05 08:58:03.591276', 'step': 1150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:03.788661', 'step': 1150, 'epoch': 1} {'type': 'loss', 'content': 0.4581383764743805, 'timestamp': '2025-09-05 08:58:03.790937', 'step': 1151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:03.996509', 'step': 1151, 'epoch': 1} {'type': 'loss', 'content': 0.3642024099826813, 'timestamp': '2025-09-05 08:58:04.011304', 'step': 1152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:04.200325', 'step': 1152, 'epoch': 1} {'type': 'loss', 'content': 0.28851771354675293, 'timestamp': '2025-09-05 08:58:04.202081', 'step': 1153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:04.398012', 'step': 1153, 'epoch': 1} {'type': 'loss', 'content': 0.297490656375885, 'timestamp': '2025-09-05 08:58:04.399844', 'step': 1154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:04.595632', 'step': 1154, 'epoch': 1} {'type': 'loss', 'content': 0.3883640766143799, 'timestamp': '2025-09-05 08:58:04.597484', 'step': 1155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:04.793403', 'step': 1155, 'epoch': 1} {'type': 'loss', 'content': 0.4353066086769104, 'timestamp': '2025-09-05 08:58:04.809872', 'step': 1156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:05.007788', 'step': 1156, 'epoch': 1} {'type': 'loss', 'content': 0.41394755244255066, 'timestamp': '2025-09-05 08:58:05.010408', 'step': 1157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:05.210965', 'step': 1157, 'epoch': 1} {'type': 'loss', 'content': 0.4061930477619171, 'timestamp': '2025-09-05 08:58:05.213045', 'step': 1158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:05.379194', 'step': 1158, 'epoch': 1} {'type': 'loss', 'content': 0.39679089188575745, 'timestamp': '2025-09-05 08:58:05.382513', 'step': 1159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:58:05.589595', 'step': 1159, 'epoch': 1} {'type': 'loss', 'content': 0.26510128378868103, 'timestamp': '2025-09-05 08:58:05.604655', 'step': 1160, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:58:10.260868', 'step': 1160, 'epoch': 1} {'type': 'pplx', 'content': 55.69607325889674, 'timestamp': '2025-09-05 08:58:10.262863', 'step': 1160, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1160', 'timestamp': '2025-09-05 08:58:10.713992', 'step': 1160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:10.883875', 'step': 1160, 'epoch': 1} {'type': 'loss', 'content': 0.2949378490447998, 'timestamp': '2025-09-05 08:58:10.885977', 'step': 1161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:11.082355', 'step': 1161, 'epoch': 1} {'type': 'loss', 'content': 0.481171578168869, 'timestamp': '2025-09-05 08:58:11.083958', 'step': 1162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:11.280145', 'step': 1162, 'epoch': 1} {'type': 'loss', 'content': 0.3129526972770691, 'timestamp': '2025-09-05 08:58:11.281934', 'step': 1163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:11.476684', 'step': 1163, 'epoch': 1} {'type': 'loss', 'content': 0.3266424238681793, 'timestamp': '2025-09-05 08:58:11.491572', 'step': 1164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:11.679597', 'step': 1164, 'epoch': 1} {'type': 'loss', 'content': 0.23583292961120605, 'timestamp': '2025-09-05 08:58:11.681817', 'step': 1165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:11.882166', 'step': 1165, 'epoch': 1} {'type': 'loss', 'content': 0.5377989411354065, 'timestamp': '2025-09-05 08:58:11.885749', 'step': 1166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:12.056875', 'step': 1166, 'epoch': 1} {'type': 'loss', 'content': 0.3818263113498688, 'timestamp': '2025-09-05 08:58:12.059180', 'step': 1167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:12.265640', 'step': 1167, 'epoch': 1} {'type': 'loss', 'content': 0.41938114166259766, 'timestamp': '2025-09-05 08:58:12.277483', 'step': 1168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:12.447993', 'step': 1168, 'epoch': 1} {'type': 'loss', 'content': 0.17846006155014038, 'timestamp': '2025-09-05 08:58:12.449643', 'step': 1169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:12.656563', 'step': 1169, 'epoch': 1} {'type': 'loss', 'content': 0.3130786418914795, 'timestamp': '2025-09-05 08:58:12.658864', 'step': 1170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:12.856448', 'step': 1170, 'epoch': 1} {'type': 'loss', 'content': 0.24914269149303436, 'timestamp': '2025-09-05 08:58:12.858087', 'step': 1171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:13.053886', 'step': 1171, 'epoch': 1} {'type': 'loss', 'content': 0.3340952694416046, 'timestamp': '2025-09-05 08:58:13.068458', 'step': 1172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:13.258558', 'step': 1172, 'epoch': 1} {'type': 'loss', 'content': 0.3117923438549042, 'timestamp': '2025-09-05 08:58:13.260298', 'step': 1173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:13.428124', 'step': 1173, 'epoch': 1} {'type': 'loss', 'content': 0.36320504546165466, 'timestamp': '2025-09-05 08:58:13.430056', 'step': 1174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:13.597395', 'step': 1174, 'epoch': 1} {'type': 'loss', 'content': 0.3952508866786957, 'timestamp': '2025-09-05 08:58:13.599309', 'step': 1175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:13.809227', 'step': 1175, 'epoch': 1} {'type': 'loss', 'content': 0.3737662136554718, 'timestamp': '2025-09-05 08:58:13.823689', 'step': 1176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:14.012685', 'step': 1176, 'epoch': 1} {'type': 'loss', 'content': 0.4855545461177826, 'timestamp': '2025-09-05 08:58:14.014560', 'step': 1177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:14.210707', 'step': 1177, 'epoch': 1} {'type': 'loss', 'content': 0.489812970161438, 'timestamp': '2025-09-05 08:58:14.212582', 'step': 1178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:14.418706', 'step': 1178, 'epoch': 1} {'type': 'loss', 'content': 0.3980942666530609, 'timestamp': '2025-09-05 08:58:14.423470', 'step': 1179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:14.629308', 'step': 1179, 'epoch': 1} {'type': 'loss', 'content': 0.4134223163127899, 'timestamp': '2025-09-05 08:58:14.645650', 'step': 1180, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:58:19.351980', 'step': 1180, 'epoch': 1} {'type': 'pplx', 'content': 54.78913997248039, 'timestamp': '2025-09-05 08:58:19.355359', 'step': 1180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:19.524235', 'step': 1180, 'epoch': 1} {'type': 'loss', 'content': 0.3536244332790375, 'timestamp': '2025-09-05 08:58:19.526582', 'step': 1181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:19.725722', 'step': 1181, 'epoch': 1} {'type': 'loss', 'content': 0.2990207076072693, 'timestamp': '2025-09-05 08:58:19.729850', 'step': 1182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:19.933714', 'step': 1182, 'epoch': 1} {'type': 'loss', 'content': 0.2687399983406067, 'timestamp': '2025-09-05 08:58:19.935632', 'step': 1183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:58:20.139334', 'step': 1183, 'epoch': 1} {'type': 'loss', 'content': 0.3062213957309723, 'timestamp': '2025-09-05 08:58:20.156178', 'step': 1184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:20.353342', 'step': 1184, 'epoch': 1} {'type': 'loss', 'content': 0.3144480586051941, 'timestamp': '2025-09-05 08:58:20.355192', 'step': 1185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:20.552429', 'step': 1185, 'epoch': 1} {'type': 'loss', 'content': 0.34665945172309875, 'timestamp': '2025-09-05 08:58:20.554396', 'step': 1186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:20.762922', 'step': 1186, 'epoch': 1} {'type': 'loss', 'content': 0.2913441061973572, 'timestamp': '2025-09-05 08:58:20.764934', 'step': 1187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:20.961203', 'step': 1187, 'epoch': 1} {'type': 'loss', 'content': 0.27978482842445374, 'timestamp': '2025-09-05 08:58:20.977949', 'step': 1188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:21.176178', 'step': 1188, 'epoch': 1} {'type': 'loss', 'content': 0.3447984755039215, 'timestamp': '2025-09-05 08:58:21.178182', 'step': 1189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:21.385149', 'step': 1189, 'epoch': 1} {'type': 'loss', 'content': 0.27464714646339417, 'timestamp': '2025-09-05 08:58:21.387222', 'step': 1190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:21.596698', 'step': 1190, 'epoch': 1} {'type': 'loss', 'content': 0.31913846731185913, 'timestamp': '2025-09-05 08:58:21.600384', 'step': 1191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:21.804357', 'step': 1191, 'epoch': 1} {'type': 'loss', 'content': 0.3542693257331848, 'timestamp': '2025-09-05 08:58:21.818909', 'step': 1192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:22.006890', 'step': 1192, 'epoch': 1} {'type': 'loss', 'content': 0.36727648973464966, 'timestamp': '2025-09-05 08:58:22.009547', 'step': 1193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:22.203981', 'step': 1193, 'epoch': 1} {'type': 'loss', 'content': 0.5776286125183105, 'timestamp': '2025-09-05 08:58:22.206814', 'step': 1194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:58:22.400605', 'step': 1194, 'epoch': 1} {'type': 'loss', 'content': 0.2967214286327362, 'timestamp': '2025-09-05 08:58:22.403373', 'step': 1195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:22.596778', 'step': 1195, 'epoch': 1} {'type': 'loss', 'content': 0.3207745850086212, 'timestamp': '2025-09-05 08:58:22.612062', 'step': 1196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:22.807056', 'step': 1196, 'epoch': 1} {'type': 'loss', 'content': 0.299358606338501, 'timestamp': '2025-09-05 08:58:22.809849', 'step': 1197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:23.006195', 'step': 1197, 'epoch': 1} {'type': 'loss', 'content': 0.3517630994319916, 'timestamp': '2025-09-05 08:58:23.008969', 'step': 1198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:23.205294', 'step': 1198, 'epoch': 1} {'type': 'loss', 'content': 0.3210662305355072, 'timestamp': '2025-09-05 08:58:23.208167', 'step': 1199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:23.402800', 'step': 1199, 'epoch': 1} {'type': 'loss', 'content': 0.2987304925918579, 'timestamp': '2025-09-05 08:58:23.420736', 'step': 1200, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:58:28.151648', 'step': 1200, 'epoch': 1} {'type': 'pplx', 'content': 55.163801136680675, 'timestamp': '2025-09-05 08:58:28.153281', 'step': 1200, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1200', 'timestamp': '2025-09-05 08:58:28.623598', 'step': 1200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:28.796735', 'step': 1200, 'epoch': 1} {'type': 'loss', 'content': 0.37314802408218384, 'timestamp': '2025-09-05 08:58:28.800241', 'step': 1201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:29.007449', 'step': 1201, 'epoch': 1} {'type': 'loss', 'content': 0.46742531657218933, 'timestamp': '2025-09-05 08:58:29.009503', 'step': 1202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:29.221368', 'step': 1202, 'epoch': 1} {'type': 'loss', 'content': 0.3984720706939697, 'timestamp': '2025-09-05 08:58:29.223451', 'step': 1203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:29.435123', 'step': 1203, 'epoch': 1} {'type': 'loss', 'content': 0.32529622316360474, 'timestamp': '2025-09-05 08:58:29.449351', 'step': 1204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:29.648833', 'step': 1204, 'epoch': 1} {'type': 'loss', 'content': 0.5382220149040222, 'timestamp': '2025-09-05 08:58:29.650921', 'step': 1205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:29.849552', 'step': 1205, 'epoch': 1} {'type': 'loss', 'content': 0.38562560081481934, 'timestamp': '2025-09-05 08:58:29.851589', 'step': 1206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:30.060179', 'step': 1206, 'epoch': 1} {'type': 'loss', 'content': 0.2525237798690796, 'timestamp': '2025-09-05 08:58:30.062452', 'step': 1207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:30.261557', 'step': 1207, 'epoch': 1} {'type': 'loss', 'content': 0.3691144287586212, 'timestamp': '2025-09-05 08:58:30.279697', 'step': 1208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:30.480112', 'step': 1208, 'epoch': 1} {'type': 'loss', 'content': 0.28269970417022705, 'timestamp': '2025-09-05 08:58:30.483548', 'step': 1209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:30.688759', 'step': 1209, 'epoch': 1} {'type': 'loss', 'content': 0.3319048583507538, 'timestamp': '2025-09-05 08:58:30.691126', 'step': 1210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:30.888760', 'step': 1210, 'epoch': 1} {'type': 'loss', 'content': 0.3252932131290436, 'timestamp': '2025-09-05 08:58:30.890442', 'step': 1211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:58:31.098575', 'step': 1211, 'epoch': 1} {'type': 'loss', 'content': 0.4231226444244385, 'timestamp': '2025-09-05 08:58:31.115403', 'step': 1212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:31.305184', 'step': 1212, 'epoch': 1} {'type': 'loss', 'content': 0.28163203597068787, 'timestamp': '2025-09-05 08:58:31.308680', 'step': 1213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:31.515231', 'step': 1213, 'epoch': 1} {'type': 'loss', 'content': 0.3782075345516205, 'timestamp': '2025-09-05 08:58:31.517092', 'step': 1214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:31.714360', 'step': 1214, 'epoch': 1} {'type': 'loss', 'content': 0.30757156014442444, 'timestamp': '2025-09-05 08:58:31.716274', 'step': 1215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:31.912180', 'step': 1215, 'epoch': 1} {'type': 'loss', 'content': 0.46240562200546265, 'timestamp': '2025-09-05 08:58:31.926460', 'step': 1216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:32.116240', 'step': 1216, 'epoch': 1} {'type': 'loss', 'content': 0.275336891412735, 'timestamp': '2025-09-05 08:58:32.118015', 'step': 1217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:32.314740', 'step': 1217, 'epoch': 1} {'type': 'loss', 'content': 0.43169447779655457, 'timestamp': '2025-09-05 08:58:32.316516', 'step': 1218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:32.523346', 'step': 1218, 'epoch': 1} {'type': 'loss', 'content': 0.4832044541835785, 'timestamp': '2025-09-05 08:58:32.525237', 'step': 1219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:32.726761', 'step': 1219, 'epoch': 1} {'type': 'loss', 'content': 0.3741658627986908, 'timestamp': '2025-09-05 08:58:32.743245', 'step': 1220, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:58:37.465340', 'step': 1220, 'epoch': 1} {'type': 'pplx', 'content': 55.636826007322504, 'timestamp': '2025-09-05 08:58:37.467376', 'step': 1220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:37.629395', 'step': 1220, 'epoch': 1} {'type': 'loss', 'content': 0.2922039031982422, 'timestamp': '2025-09-05 08:58:37.633868', 'step': 1221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:37.801769', 'step': 1221, 'epoch': 1} {'type': 'loss', 'content': 0.31432512402534485, 'timestamp': '2025-09-05 08:58:37.806881', 'step': 1222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:38.020020', 'step': 1222, 'epoch': 1} {'type': 'loss', 'content': 0.273971289396286, 'timestamp': '2025-09-05 08:58:38.021964', 'step': 1223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:38.217738', 'step': 1223, 'epoch': 1} {'type': 'loss', 'content': 0.2825872302055359, 'timestamp': '2025-09-05 08:58:38.234390', 'step': 1224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:38.430446', 'step': 1224, 'epoch': 1} {'type': 'loss', 'content': 0.3784426152706146, 'timestamp': '2025-09-05 08:58:38.432294', 'step': 1225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:38.638082', 'step': 1225, 'epoch': 1} {'type': 'loss', 'content': 0.36815857887268066, 'timestamp': '2025-09-05 08:58:38.639863', 'step': 1226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:38.846971', 'step': 1226, 'epoch': 1} {'type': 'loss', 'content': 0.3271258771419525, 'timestamp': '2025-09-05 08:58:38.848960', 'step': 1227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 08:58:39.045679', 'step': 1227, 'epoch': 1} {'type': 'loss', 'content': 0.2493131011724472, 'timestamp': '2025-09-05 08:58:39.060206', 'step': 1228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:39.255473', 'step': 1228, 'epoch': 1} {'type': 'loss', 'content': 0.2988832890987396, 'timestamp': '2025-09-05 08:58:39.257757', 'step': 1229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:39.472160', 'step': 1229, 'epoch': 1} {'type': 'loss', 'content': 0.44903451204299927, 'timestamp': '2025-09-05 08:58:39.473964', 'step': 1230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:39.671824', 'step': 1230, 'epoch': 1} {'type': 'loss', 'content': 0.5419812798500061, 'timestamp': '2025-09-05 08:58:39.674087', 'step': 1231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:39.882692', 'step': 1231, 'epoch': 1} {'type': 'loss', 'content': 0.3951758146286011, 'timestamp': '2025-09-05 08:58:39.897357', 'step': 1232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:40.086411', 'step': 1232, 'epoch': 1} {'type': 'loss', 'content': 0.24042554199695587, 'timestamp': '2025-09-05 08:58:40.088327', 'step': 1233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:40.294687', 'step': 1233, 'epoch': 1} {'type': 'loss', 'content': 0.4239327013492584, 'timestamp': '2025-09-05 08:58:40.296994', 'step': 1234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:40.497105', 'step': 1234, 'epoch': 1} {'type': 'loss', 'content': 0.19099119305610657, 'timestamp': '2025-09-05 08:58:40.499457', 'step': 1235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:40.706219', 'step': 1235, 'epoch': 1} {'type': 'loss', 'content': 0.30486881732940674, 'timestamp': '2025-09-05 08:58:40.722373', 'step': 1236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:40.920116', 'step': 1236, 'epoch': 1} {'type': 'loss', 'content': 0.24796439707279205, 'timestamp': '2025-09-05 08:58:40.921901', 'step': 1237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:41.118829', 'step': 1237, 'epoch': 1} {'type': 'loss', 'content': 0.2669127881526947, 'timestamp': '2025-09-05 08:58:41.120812', 'step': 1238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:41.327218', 'step': 1238, 'epoch': 1} {'type': 'loss', 'content': 0.20141269266605377, 'timestamp': '2025-09-05 08:58:41.329910', 'step': 1239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:41.530544', 'step': 1239, 'epoch': 1} {'type': 'loss', 'content': 0.4013859033584595, 'timestamp': '2025-09-05 08:58:41.547398', 'step': 1240, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:58:46.337252', 'step': 1240, 'epoch': 1} {'type': 'pplx', 'content': 55.81478220379367, 'timestamp': '2025-09-05 08:58:46.339141', 'step': 1240, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1240', 'timestamp': '2025-09-05 08:58:47.010044', 'step': 1240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:47.179545', 'step': 1240, 'epoch': 1} {'type': 'loss', 'content': 0.3496423661708832, 'timestamp': '2025-09-05 08:58:47.181523', 'step': 1241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:47.391265', 'step': 1241, 'epoch': 1} {'type': 'loss', 'content': 0.17773263156414032, 'timestamp': '2025-09-05 08:58:47.393443', 'step': 1242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:47.591587', 'step': 1242, 'epoch': 1} {'type': 'loss', 'content': 0.3922744393348694, 'timestamp': '2025-09-05 08:58:47.593807', 'step': 1243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:47.792392', 'step': 1243, 'epoch': 1} {'type': 'loss', 'content': 0.39606019854545593, 'timestamp': '2025-09-05 08:58:47.808993', 'step': 1244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:48.005991', 'step': 1244, 'epoch': 1} {'type': 'loss', 'content': 0.28507742285728455, 'timestamp': '2025-09-05 08:58:48.007751', 'step': 1245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:48.208527', 'step': 1245, 'epoch': 1} {'type': 'loss', 'content': 0.38115379214286804, 'timestamp': '2025-09-05 08:58:48.210544', 'step': 1246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:58:48.420393', 'step': 1246, 'epoch': 1} {'type': 'loss', 'content': 0.38982245326042175, 'timestamp': '2025-09-05 08:58:48.423008', 'step': 1247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:48.621835', 'step': 1247, 'epoch': 1} {'type': 'loss', 'content': 0.3070641756057739, 'timestamp': '2025-09-05 08:58:48.639605', 'step': 1248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:48.835488', 'step': 1248, 'epoch': 1} {'type': 'loss', 'content': 0.29290473461151123, 'timestamp': '2025-09-05 08:58:48.837083', 'step': 1249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:49.044009', 'step': 1249, 'epoch': 1} {'type': 'loss', 'content': 0.33328184485435486, 'timestamp': '2025-09-05 08:58:49.045921', 'step': 1250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:58:49.243517', 'step': 1250, 'epoch': 1} {'type': 'loss', 'content': 0.482103168964386, 'timestamp': '2025-09-05 08:58:49.245731', 'step': 1251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:49.451807', 'step': 1251, 'epoch': 1} {'type': 'loss', 'content': 0.4451736807823181, 'timestamp': '2025-09-05 08:58:49.465976', 'step': 1252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:49.664436', 'step': 1252, 'epoch': 1} {'type': 'loss', 'content': 0.2903027832508087, 'timestamp': '2025-09-05 08:58:49.666700', 'step': 1253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:49.873073', 'step': 1253, 'epoch': 1} {'type': 'loss', 'content': 0.2942162752151489, 'timestamp': '2025-09-05 08:58:49.875001', 'step': 1254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:50.072634', 'step': 1254, 'epoch': 1} {'type': 'loss', 'content': 0.4005998969078064, 'timestamp': '2025-09-05 08:58:50.074375', 'step': 1255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:50.273482', 'step': 1255, 'epoch': 1} {'type': 'loss', 'content': 0.3666303753852844, 'timestamp': '2025-09-05 08:58:50.290366', 'step': 1256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:50.486596', 'step': 1256, 'epoch': 1} {'type': 'loss', 'content': 0.44452735781669617, 'timestamp': '2025-09-05 08:58:50.488547', 'step': 1257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:58:50.689502', 'step': 1257, 'epoch': 1} {'type': 'loss', 'content': 0.3432859480381012, 'timestamp': '2025-09-05 08:58:50.691337', 'step': 1258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:50.895348', 'step': 1258, 'epoch': 1} {'type': 'loss', 'content': 0.44465234875679016, 'timestamp': '2025-09-05 08:58:50.899575', 'step': 1259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:51.107777', 'step': 1259, 'epoch': 1} {'type': 'loss', 'content': 0.43338504433631897, 'timestamp': '2025-09-05 08:58:51.124714', 'step': 1260, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:58:55.873912', 'step': 1260, 'epoch': 1} {'type': 'pplx', 'content': 56.86786214445365, 'timestamp': '2025-09-05 08:58:55.877053', 'step': 1260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:56.038358', 'step': 1260, 'epoch': 1} {'type': 'loss', 'content': 0.2701398730278015, 'timestamp': '2025-09-05 08:58:56.040599', 'step': 1261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:56.246168', 'step': 1261, 'epoch': 1} {'type': 'loss', 'content': 0.319100558757782, 'timestamp': '2025-09-05 08:58:56.248207', 'step': 1262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:56.445175', 'step': 1262, 'epoch': 1} {'type': 'loss', 'content': 0.3955155313014984, 'timestamp': '2025-09-05 08:58:56.447043', 'step': 1263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:56.653005', 'step': 1263, 'epoch': 1} {'type': 'loss', 'content': 0.28663066029548645, 'timestamp': '2025-09-05 08:58:56.672510', 'step': 1264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:56.882587', 'step': 1264, 'epoch': 1} {'type': 'loss', 'content': 0.2910723090171814, 'timestamp': '2025-09-05 08:58:56.884373', 'step': 1265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:58:57.081041', 'step': 1265, 'epoch': 1} {'type': 'loss', 'content': 0.48940280079841614, 'timestamp': '2025-09-05 08:58:57.083007', 'step': 1266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:57.278680', 'step': 1266, 'epoch': 1} {'type': 'loss', 'content': 0.25540128350257874, 'timestamp': '2025-09-05 08:58:57.280828', 'step': 1267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:57.478468', 'step': 1267, 'epoch': 1} {'type': 'loss', 'content': 0.4109537899494171, 'timestamp': '2025-09-05 08:58:57.492922', 'step': 1268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:57.682652', 'step': 1268, 'epoch': 1} {'type': 'loss', 'content': 0.3179936408996582, 'timestamp': '2025-09-05 08:58:57.685638', 'step': 1269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:57.893860', 'step': 1269, 'epoch': 1} {'type': 'loss', 'content': 0.33500999212265015, 'timestamp': '2025-09-05 08:58:57.896864', 'step': 1270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:58.105312', 'step': 1270, 'epoch': 1} {'type': 'loss', 'content': 0.316933810710907, 'timestamp': '2025-09-05 08:58:58.107330', 'step': 1271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:58:58.303861', 'step': 1271, 'epoch': 1} {'type': 'loss', 'content': 0.3351965844631195, 'timestamp': '2025-09-05 08:58:58.326530', 'step': 1272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:58.515213', 'step': 1272, 'epoch': 1} {'type': 'loss', 'content': 0.42428067326545715, 'timestamp': '2025-09-05 08:58:58.517405', 'step': 1273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:58.713550', 'step': 1273, 'epoch': 1} {'type': 'loss', 'content': 0.24712416529655457, 'timestamp': '2025-09-05 08:58:58.716283', 'step': 1274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:58:58.916484', 'step': 1274, 'epoch': 1} {'type': 'loss', 'content': 0.25089097023010254, 'timestamp': '2025-09-05 08:58:58.921477', 'step': 1275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:58:59.132790', 'step': 1275, 'epoch': 1} {'type': 'loss', 'content': 0.3202970325946808, 'timestamp': '2025-09-05 08:58:59.148507', 'step': 1276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 08:58:59.342869', 'step': 1276, 'epoch': 1} {'type': 'loss', 'content': 0.27855849266052246, 'timestamp': '2025-09-05 08:58:59.344795', 'step': 1277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:58:59.543276', 'step': 1277, 'epoch': 1} {'type': 'loss', 'content': 0.40407755970954895, 'timestamp': '2025-09-05 08:58:59.545205', 'step': 1278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:59.749926', 'step': 1278, 'epoch': 1} {'type': 'loss', 'content': 0.1615646630525589, 'timestamp': '2025-09-05 08:58:59.759180', 'step': 1279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:58:59.964874', 'step': 1279, 'epoch': 1} {'type': 'loss', 'content': 0.3803251385688782, 'timestamp': '2025-09-05 08:58:59.981869', 'step': 1280, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:59:04.747412', 'step': 1280, 'epoch': 1} {'type': 'pplx', 'content': 57.685687212488666, 'timestamp': '2025-09-05 08:59:04.749354', 'step': 1280, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 1280', 'timestamp': '2025-09-05 08:59:05.213901', 'step': 1280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:05.376627', 'step': 1280, 'epoch': 1} {'type': 'loss', 'content': 0.3631496727466583, 'timestamp': '2025-09-05 08:59:05.378976', 'step': 1281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:05.579929', 'step': 1281, 'epoch': 1} {'type': 'loss', 'content': 0.3576178252696991, 'timestamp': '2025-09-05 08:59:05.581880', 'step': 1282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:05.778238', 'step': 1282, 'epoch': 1} {'type': 'loss', 'content': 0.33735018968582153, 'timestamp': '2025-09-05 08:59:05.781707', 'step': 1283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:05.980579', 'step': 1283, 'epoch': 1} {'type': 'loss', 'content': 0.4075542986392975, 'timestamp': '2025-09-05 08:59:05.997002', 'step': 1284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:06.192702', 'step': 1284, 'epoch': 1} {'type': 'loss', 'content': 0.2630583643913269, 'timestamp': '2025-09-05 08:59:06.194210', 'step': 1285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:59:06.400762', 'step': 1285, 'epoch': 1} {'type': 'loss', 'content': 0.46530115604400635, 'timestamp': '2025-09-05 08:59:06.402542', 'step': 1286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:06.608888', 'step': 1286, 'epoch': 1} {'type': 'loss', 'content': 0.3795335292816162, 'timestamp': '2025-09-05 08:59:06.610712', 'step': 1287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 08:59:06.828383', 'step': 1287, 'epoch': 1} {'type': 'loss', 'content': 0.43324583768844604, 'timestamp': '2025-09-05 08:59:06.842845', 'step': 1288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:07.031687', 'step': 1288, 'epoch': 1} {'type': 'loss', 'content': 0.2171895056962967, 'timestamp': '2025-09-05 08:59:07.033567', 'step': 1289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:07.229948', 'step': 1289, 'epoch': 1} {'type': 'loss', 'content': 0.25000834465026855, 'timestamp': '2025-09-05 08:59:07.231777', 'step': 1290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:07.427444', 'step': 1290, 'epoch': 1} {'type': 'loss', 'content': 0.2618256211280823, 'timestamp': '2025-09-05 08:59:07.429500', 'step': 1291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:07.635328', 'step': 1291, 'epoch': 1} {'type': 'loss', 'content': 0.395792156457901, 'timestamp': '2025-09-05 08:59:07.652426', 'step': 1292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:07.849809', 'step': 1292, 'epoch': 1} {'type': 'loss', 'content': 0.3665156066417694, 'timestamp': '2025-09-05 08:59:07.852069', 'step': 1293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:08.048771', 'step': 1293, 'epoch': 1} {'type': 'loss', 'content': 0.38869205117225647, 'timestamp': '2025-09-05 08:59:08.050665', 'step': 1294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:08.217638', 'step': 1294, 'epoch': 1} {'type': 'loss', 'content': 0.27352771162986755, 'timestamp': '2025-09-05 08:59:08.222770', 'step': 1295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:08.441962', 'step': 1295, 'epoch': 1} {'type': 'loss', 'content': 0.3475234806537628, 'timestamp': '2025-09-05 08:59:08.456264', 'step': 1296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:08.643690', 'step': 1296, 'epoch': 1} {'type': 'loss', 'content': 0.4448797106742859, 'timestamp': '2025-09-05 08:59:08.645368', 'step': 1297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:08.839667', 'step': 1297, 'epoch': 1} {'type': 'loss', 'content': 0.27712783217430115, 'timestamp': '2025-09-05 08:59:08.841451', 'step': 1298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:09.046283', 'step': 1298, 'epoch': 1} {'type': 'loss', 'content': 0.2694713771343231, 'timestamp': '2025-09-05 08:59:09.048220', 'step': 1299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:09.242540', 'step': 1299, 'epoch': 1} {'type': 'loss', 'content': 0.194560706615448, 'timestamp': '2025-09-05 08:59:09.257384', 'step': 1300, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:59:13.911796', 'step': 1300, 'epoch': 1} {'type': 'pplx', 'content': 57.60386166296047, 'timestamp': '2025-09-05 08:59:13.915390', 'step': 1300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:14.080096', 'step': 1300, 'epoch': 2} {'type': 'loss', 'content': 0.3144618272781372, 'timestamp': '2025-09-05 08:59:14.082040', 'step': 1301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:14.252081', 'step': 1301, 'epoch': 2} {'type': 'loss', 'content': 0.3946076035499573, 'timestamp': '2025-09-05 08:59:14.254065', 'step': 1302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:14.458505', 'step': 1302, 'epoch': 2} {'type': 'loss', 'content': 0.2931336760520935, 'timestamp': '2025-09-05 08:59:14.460293', 'step': 1303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:14.667262', 'step': 1303, 'epoch': 2} {'type': 'loss', 'content': 0.4380492568016052, 'timestamp': '2025-09-05 08:59:14.684802', 'step': 1304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:14.883125', 'step': 1304, 'epoch': 2} {'type': 'loss', 'content': 0.26544374227523804, 'timestamp': '2025-09-05 08:59:14.885058', 'step': 1305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:15.090269', 'step': 1305, 'epoch': 2} {'type': 'loss', 'content': 0.363553911447525, 'timestamp': '2025-09-05 08:59:15.091915', 'step': 1306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:15.287843', 'step': 1306, 'epoch': 2} {'type': 'loss', 'content': 0.3278323709964752, 'timestamp': '2025-09-05 08:59:15.289648', 'step': 1307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:15.495949', 'step': 1307, 'epoch': 2} {'type': 'loss', 'content': 0.49476808309555054, 'timestamp': '2025-09-05 08:59:15.505766', 'step': 1308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:15.669479', 'step': 1308, 'epoch': 2} {'type': 'loss', 'content': 0.4298350214958191, 'timestamp': '2025-09-05 08:59:15.671595', 'step': 1309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:15.876381', 'step': 1309, 'epoch': 2} {'type': 'loss', 'content': 0.41758593916893005, 'timestamp': '2025-09-05 08:59:15.878324', 'step': 1310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:16.074306', 'step': 1310, 'epoch': 2} {'type': 'loss', 'content': 0.43739357590675354, 'timestamp': '2025-09-05 08:59:16.076299', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:16.282501', 'step': 1311, 'epoch': 2} {'type': 'loss', 'content': 0.44980770349502563, 'timestamp': '2025-09-05 08:59:16.300777', 'step': 1312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:16.492159', 'step': 1312, 'epoch': 2} {'type': 'loss', 'content': 0.3103547692298889, 'timestamp': '2025-09-05 08:59:16.494221', 'step': 1313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:16.701249', 'step': 1313, 'epoch': 2} {'type': 'loss', 'content': 0.42775505781173706, 'timestamp': '2025-09-05 08:59:16.703236', 'step': 1314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:16.910343', 'step': 1314, 'epoch': 2} {'type': 'loss', 'content': 0.2993074059486389, 'timestamp': '2025-09-05 08:59:16.913368', 'step': 1315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:17.108812', 'step': 1315, 'epoch': 2} {'type': 'loss', 'content': 0.3599098324775696, 'timestamp': '2025-09-05 08:59:17.123094', 'step': 1316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:17.312042', 'step': 1316, 'epoch': 2} {'type': 'loss', 'content': 0.3977486491203308, 'timestamp': '2025-09-05 08:59:17.313889', 'step': 1317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:17.510084', 'step': 1317, 'epoch': 2} {'type': 'loss', 'content': 0.3357473611831665, 'timestamp': '2025-09-05 08:59:17.512160', 'step': 1318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:17.709325', 'step': 1318, 'epoch': 2} {'type': 'loss', 'content': 0.309467077255249, 'timestamp': '2025-09-05 08:59:17.711246', 'step': 1319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:17.908970', 'step': 1319, 'epoch': 2} {'type': 'loss', 'content': 0.29499152302742004, 'timestamp': '2025-09-05 08:59:17.923049', 'step': 1320, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:59:22.624294', 'step': 1320, 'epoch': 2} {'type': 'pplx', 'content': 57.36183768004392, 'timestamp': '2025-09-05 08:59:22.626617', 'step': 1320, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1320', 'timestamp': '2025-09-05 08:59:23.087989', 'step': 1320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:23.269735', 'step': 1320, 'epoch': 2} {'type': 'loss', 'content': 0.44477683305740356, 'timestamp': '2025-09-05 08:59:23.272494', 'step': 1321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:23.440651', 'step': 1321, 'epoch': 2} {'type': 'loss', 'content': 0.46572038531303406, 'timestamp': '2025-09-05 08:59:23.443718', 'step': 1322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:23.650943', 'step': 1322, 'epoch': 2} {'type': 'loss', 'content': 0.27906832098960876, 'timestamp': '2025-09-05 08:59:23.653252', 'step': 1323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:23.865736', 'step': 1323, 'epoch': 2} {'type': 'loss', 'content': 0.4312252104282379, 'timestamp': '2025-09-05 08:59:23.880583', 'step': 1324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:24.071014', 'step': 1324, 'epoch': 2} {'type': 'loss', 'content': 0.30487996339797974, 'timestamp': '2025-09-05 08:59:24.074126', 'step': 1325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:24.244295', 'step': 1325, 'epoch': 2} {'type': 'loss', 'content': 0.3292028307914734, 'timestamp': '2025-09-05 08:59:24.247257', 'step': 1326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:24.452913', 'step': 1326, 'epoch': 2} {'type': 'loss', 'content': 0.3268643915653229, 'timestamp': '2025-09-05 08:59:24.455149', 'step': 1327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:24.651837', 'step': 1327, 'epoch': 2} {'type': 'loss', 'content': 0.39009836316108704, 'timestamp': '2025-09-05 08:59:24.666828', 'step': 1328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:24.856475', 'step': 1328, 'epoch': 2} {'type': 'loss', 'content': 0.33519843220710754, 'timestamp': '2025-09-05 08:59:24.858850', 'step': 1329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:25.063589', 'step': 1329, 'epoch': 2} {'type': 'loss', 'content': 0.28642693161964417, 'timestamp': '2025-09-05 08:59:25.065551', 'step': 1330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:25.278910', 'step': 1330, 'epoch': 2} {'type': 'loss', 'content': 0.44241082668304443, 'timestamp': '2025-09-05 08:59:25.280946', 'step': 1331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:25.479903', 'step': 1331, 'epoch': 2} {'type': 'loss', 'content': 0.33670666813850403, 'timestamp': '2025-09-05 08:59:25.494610', 'step': 1332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:25.683317', 'step': 1332, 'epoch': 2} {'type': 'loss', 'content': 0.2926885485649109, 'timestamp': '2025-09-05 08:59:25.685308', 'step': 1333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:25.891586', 'step': 1333, 'epoch': 2} {'type': 'loss', 'content': 0.3921295404434204, 'timestamp': '2025-09-05 08:59:25.894116', 'step': 1334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:26.100092', 'step': 1334, 'epoch': 2} {'type': 'loss', 'content': 0.37950482964515686, 'timestamp': '2025-09-05 08:59:26.102003', 'step': 1335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:26.299905', 'step': 1335, 'epoch': 2} {'type': 'loss', 'content': 0.4223680794239044, 'timestamp': '2025-09-05 08:59:26.314409', 'step': 1336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:26.503230', 'step': 1336, 'epoch': 2} {'type': 'loss', 'content': 0.4278903603553772, 'timestamp': '2025-09-05 08:59:26.505314', 'step': 1337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:26.712369', 'step': 1337, 'epoch': 2} {'type': 'loss', 'content': 0.3812592923641205, 'timestamp': '2025-09-05 08:59:26.714473', 'step': 1338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:26.921242', 'step': 1338, 'epoch': 2} {'type': 'loss', 'content': 0.3434469997882843, 'timestamp': '2025-09-05 08:59:26.925060', 'step': 1339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:27.124683', 'step': 1339, 'epoch': 2} {'type': 'loss', 'content': 0.2850334346294403, 'timestamp': '2025-09-05 08:59:27.139548', 'step': 1340, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:59:31.890740', 'step': 1340, 'epoch': 2} {'type': 'pplx', 'content': 56.91573223377922, 'timestamp': '2025-09-05 08:59:31.893487', 'step': 1340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:32.055369', 'step': 1340, 'epoch': 2} {'type': 'loss', 'content': 0.3462655246257782, 'timestamp': '2025-09-05 08:59:32.057646', 'step': 1341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:32.262517', 'step': 1341, 'epoch': 2} {'type': 'loss', 'content': 0.346880704164505, 'timestamp': '2025-09-05 08:59:32.264638', 'step': 1342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:32.463118', 'step': 1342, 'epoch': 2} {'type': 'loss', 'content': 0.3966115415096283, 'timestamp': '2025-09-05 08:59:32.465165', 'step': 1343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:32.671225', 'step': 1343, 'epoch': 2} {'type': 'loss', 'content': 0.33604347705841064, 'timestamp': '2025-09-05 08:59:32.688689', 'step': 1344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:32.885583', 'step': 1344, 'epoch': 2} {'type': 'loss', 'content': 0.40588828921318054, 'timestamp': '2025-09-05 08:59:32.887860', 'step': 1345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:33.085720', 'step': 1345, 'epoch': 2} {'type': 'loss', 'content': 0.24824795126914978, 'timestamp': '2025-09-05 08:59:33.089596', 'step': 1346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:33.306741', 'step': 1346, 'epoch': 2} {'type': 'loss', 'content': 0.37973007559776306, 'timestamp': '2025-09-05 08:59:33.309131', 'step': 1347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:33.517134', 'step': 1347, 'epoch': 2} {'type': 'loss', 'content': 0.46875327825546265, 'timestamp': '2025-09-05 08:59:33.534015', 'step': 1348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:33.737292', 'step': 1348, 'epoch': 2} {'type': 'loss', 'content': 0.315308541059494, 'timestamp': '2025-09-05 08:59:33.739605', 'step': 1349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:33.939497', 'step': 1349, 'epoch': 2} {'type': 'loss', 'content': 0.40450769662857056, 'timestamp': '2025-09-05 08:59:33.941817', 'step': 1350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:34.148841', 'step': 1350, 'epoch': 2} {'type': 'loss', 'content': 0.33537474274635315, 'timestamp': '2025-09-05 08:59:34.151279', 'step': 1351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:34.353877', 'step': 1351, 'epoch': 2} {'type': 'loss', 'content': 0.4013102948665619, 'timestamp': '2025-09-05 08:59:34.373704', 'step': 1352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:34.571837', 'step': 1352, 'epoch': 2} {'type': 'loss', 'content': 0.37566959857940674, 'timestamp': '2025-09-05 08:59:34.574167', 'step': 1353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:34.772021', 'step': 1353, 'epoch': 2} {'type': 'loss', 'content': 0.2328030914068222, 'timestamp': '2025-09-05 08:59:34.774460', 'step': 1354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:34.982169', 'step': 1354, 'epoch': 2} {'type': 'loss', 'content': 0.40998750925064087, 'timestamp': '2025-09-05 08:59:34.984549', 'step': 1355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:35.198027', 'step': 1355, 'epoch': 2} {'type': 'loss', 'content': 0.3128526210784912, 'timestamp': '2025-09-05 08:59:35.214422', 'step': 1356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:35.405238', 'step': 1356, 'epoch': 2} {'type': 'loss', 'content': 0.26632222533226013, 'timestamp': '2025-09-05 08:59:35.407784', 'step': 1357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:35.615592', 'step': 1357, 'epoch': 2} {'type': 'loss', 'content': 0.4280310273170471, 'timestamp': '2025-09-05 08:59:35.617878', 'step': 1358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:35.831407', 'step': 1358, 'epoch': 2} {'type': 'loss', 'content': 0.3956908881664276, 'timestamp': '2025-09-05 08:59:35.833787', 'step': 1359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:36.041582', 'step': 1359, 'epoch': 2} {'type': 'loss', 'content': 0.3153320550918579, 'timestamp': '2025-09-05 08:59:36.056665', 'step': 1360, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:59:40.819925', 'step': 1360, 'epoch': 2} {'type': 'pplx', 'content': 56.70963319815915, 'timestamp': '2025-09-05 08:59:40.823587', 'step': 1360, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1360', 'timestamp': '2025-09-05 08:59:41.503796', 'step': 1360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:41.682641', 'step': 1360, 'epoch': 2} {'type': 'loss', 'content': 0.28595757484436035, 'timestamp': '2025-09-05 08:59:41.685944', 'step': 1361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:41.882882', 'step': 1361, 'epoch': 2} {'type': 'loss', 'content': 0.24493901431560516, 'timestamp': '2025-09-05 08:59:41.885326', 'step': 1362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:42.085395', 'step': 1362, 'epoch': 2} {'type': 'loss', 'content': 0.5457180142402649, 'timestamp': '2025-09-05 08:59:42.088843', 'step': 1363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:42.297550', 'step': 1363, 'epoch': 2} {'type': 'loss', 'content': 0.4040033221244812, 'timestamp': '2025-09-05 08:59:42.311680', 'step': 1364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:42.500508', 'step': 1364, 'epoch': 2} {'type': 'loss', 'content': 0.45766469836235046, 'timestamp': '2025-09-05 08:59:42.504774', 'step': 1365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:42.715436', 'step': 1365, 'epoch': 2} {'type': 'loss', 'content': 0.3269166648387909, 'timestamp': '2025-09-05 08:59:42.717414', 'step': 1366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:42.928316', 'step': 1366, 'epoch': 2} {'type': 'loss', 'content': 0.4435346722602844, 'timestamp': '2025-09-05 08:59:42.930765', 'step': 1367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:43.139410', 'step': 1367, 'epoch': 2} {'type': 'loss', 'content': 0.30959320068359375, 'timestamp': '2025-09-05 08:59:43.153945', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:43.344550', 'step': 1368, 'epoch': 2} {'type': 'loss', 'content': 0.3181923031806946, 'timestamp': '2025-09-05 08:59:43.348085', 'step': 1369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:43.550510', 'step': 1369, 'epoch': 2} {'type': 'loss', 'content': 0.3809313476085663, 'timestamp': '2025-09-05 08:59:43.554965', 'step': 1370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:43.764296', 'step': 1370, 'epoch': 2} {'type': 'loss', 'content': 0.3070775270462036, 'timestamp': '2025-09-05 08:59:43.765849', 'step': 1371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:43.970865', 'step': 1371, 'epoch': 2} {'type': 'loss', 'content': 0.3441022038459778, 'timestamp': '2025-09-05 08:59:43.987786', 'step': 1372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:44.196406', 'step': 1372, 'epoch': 2} {'type': 'loss', 'content': 0.48999282717704773, 'timestamp': '2025-09-05 08:59:44.199384', 'step': 1373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:59:44.400537', 'step': 1373, 'epoch': 2} {'type': 'loss', 'content': 0.3837975561618805, 'timestamp': '2025-09-05 08:59:44.402764', 'step': 1374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:44.602234', 'step': 1374, 'epoch': 2} {'type': 'loss', 'content': 0.29959526658058167, 'timestamp': '2025-09-05 08:59:44.606416', 'step': 1375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:44.820077', 'step': 1375, 'epoch': 2} {'type': 'loss', 'content': 0.3829365372657776, 'timestamp': '2025-09-05 08:59:44.836407', 'step': 1376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:45.029202', 'step': 1376, 'epoch': 2} {'type': 'loss', 'content': 0.3905230164527893, 'timestamp': '2025-09-05 08:59:45.031717', 'step': 1377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:45.228189', 'step': 1377, 'epoch': 2} {'type': 'loss', 'content': 0.2604524493217468, 'timestamp': '2025-09-05 08:59:45.230920', 'step': 1378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:45.401870', 'step': 1378, 'epoch': 2} {'type': 'loss', 'content': 0.2648540437221527, 'timestamp': '2025-09-05 08:59:45.403891', 'step': 1379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:45.610398', 'step': 1379, 'epoch': 2} {'type': 'loss', 'content': 0.25715941190719604, 'timestamp': '2025-09-05 08:59:45.625180', 'step': 1380, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 08:59:50.612485', 'step': 1380, 'epoch': 2} {'type': 'pplx', 'content': 56.62276512044233, 'timestamp': '2025-09-05 08:59:50.616571', 'step': 1380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:50.791068', 'step': 1380, 'epoch': 2} {'type': 'loss', 'content': 0.39979779720306396, 'timestamp': '2025-09-05 08:59:50.792966', 'step': 1381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:50.997964', 'step': 1381, 'epoch': 2} {'type': 'loss', 'content': 0.3293604254722595, 'timestamp': '2025-09-05 08:59:51.001515', 'step': 1382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 08:59:51.214343', 'step': 1382, 'epoch': 2} {'type': 'loss', 'content': 0.4426141679286957, 'timestamp': '2025-09-05 08:59:51.216786', 'step': 1383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:51.477127', 'step': 1383, 'epoch': 2} {'type': 'loss', 'content': 0.4094974994659424, 'timestamp': '2025-09-05 08:59:51.492213', 'step': 1384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:51.692197', 'step': 1384, 'epoch': 2} {'type': 'loss', 'content': 0.3006475865840912, 'timestamp': '2025-09-05 08:59:51.694616', 'step': 1385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:51.893460', 'step': 1385, 'epoch': 2} {'type': 'loss', 'content': 0.3527871370315552, 'timestamp': '2025-09-05 08:59:51.895681', 'step': 1386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:52.108314', 'step': 1386, 'epoch': 2} {'type': 'loss', 'content': 0.4060435891151428, 'timestamp': '2025-09-05 08:59:52.113186', 'step': 1387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:59:52.315169', 'step': 1387, 'epoch': 2} {'type': 'loss', 'content': 0.31812533736228943, 'timestamp': '2025-09-05 08:59:52.329965', 'step': 1388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 08:59:52.523118', 'step': 1388, 'epoch': 2} {'type': 'loss', 'content': 0.4235638976097107, 'timestamp': '2025-09-05 08:59:52.525559', 'step': 1389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:52.765252', 'step': 1389, 'epoch': 2} {'type': 'loss', 'content': 0.3670898377895355, 'timestamp': '2025-09-05 08:59:52.768592', 'step': 1390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:52.967645', 'step': 1390, 'epoch': 2} {'type': 'loss', 'content': 0.3166564404964447, 'timestamp': '2025-09-05 08:59:52.969921', 'step': 1391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:53.171302', 'step': 1391, 'epoch': 2} {'type': 'loss', 'content': 0.3685086965560913, 'timestamp': '2025-09-05 08:59:53.189842', 'step': 1392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:53.392408', 'step': 1392, 'epoch': 2} {'type': 'loss', 'content': 0.24257375299930573, 'timestamp': '2025-09-05 08:59:53.394129', 'step': 1393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:53.602965', 'step': 1393, 'epoch': 2} {'type': 'loss', 'content': 0.4437284767627716, 'timestamp': '2025-09-05 08:59:53.605553', 'step': 1394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 08:59:53.885427', 'step': 1394, 'epoch': 2} {'type': 'loss', 'content': 0.44088515639305115, 'timestamp': '2025-09-05 08:59:53.888345', 'step': 1395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 08:59:54.104505', 'step': 1395, 'epoch': 2} {'type': 'loss', 'content': 0.47625863552093506, 'timestamp': '2025-09-05 08:59:54.123048', 'step': 1396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 08:59:54.327570', 'step': 1396, 'epoch': 2} {'type': 'loss', 'content': 0.46532103419303894, 'timestamp': '2025-09-05 08:59:54.330291', 'step': 1397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:54.532148', 'step': 1397, 'epoch': 2} {'type': 'loss', 'content': 0.32851770520210266, 'timestamp': '2025-09-05 08:59:54.534121', 'step': 1398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:54.827163', 'step': 1398, 'epoch': 2} {'type': 'loss', 'content': 0.5156394243240356, 'timestamp': '2025-09-05 08:59:54.829390', 'step': 1399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 08:59:55.038664', 'step': 1399, 'epoch': 2} {'type': 'loss', 'content': 0.33277690410614014, 'timestamp': '2025-09-05 08:59:55.054002', 'step': 1400, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:00:00.092698', 'step': 1400, 'epoch': 2} {'type': 'pplx', 'content': 56.81417784437169, 'timestamp': '2025-09-05 09:00:00.095367', 'step': 1400, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1400', 'timestamp': '2025-09-05 09:00:00.596593', 'step': 1400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:00.791451', 'step': 1400, 'epoch': 2} {'type': 'loss', 'content': 0.25804686546325684, 'timestamp': '2025-09-05 09:00:00.794003', 'step': 1401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:00:01.002420', 'step': 1401, 'epoch': 2} {'type': 'loss', 'content': 0.31888291239738464, 'timestamp': '2025-09-05 09:00:01.005189', 'step': 1402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:01.208357', 'step': 1402, 'epoch': 2} {'type': 'loss', 'content': 0.36028096079826355, 'timestamp': '2025-09-05 09:00:01.210255', 'step': 1403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:01.414257', 'step': 1403, 'epoch': 2} {'type': 'loss', 'content': 0.2646711766719818, 'timestamp': '2025-09-05 09:00:01.428985', 'step': 1404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:01.625056', 'step': 1404, 'epoch': 2} {'type': 'loss', 'content': 0.3558668792247772, 'timestamp': '2025-09-05 09:00:01.627352', 'step': 1405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:01.824547', 'step': 1405, 'epoch': 2} {'type': 'loss', 'content': 0.3586845099925995, 'timestamp': '2025-09-05 09:00:01.827510', 'step': 1406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:02.029391', 'step': 1406, 'epoch': 2} {'type': 'loss', 'content': 0.20658056437969208, 'timestamp': '2025-09-05 09:00:02.032797', 'step': 1407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:02.243681', 'step': 1407, 'epoch': 2} {'type': 'loss', 'content': 0.3639076352119446, 'timestamp': '2025-09-05 09:00:02.260804', 'step': 1408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:00:02.462268', 'step': 1408, 'epoch': 2} {'type': 'loss', 'content': 0.26506686210632324, 'timestamp': '2025-09-05 09:00:02.465508', 'step': 1409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:02.664380', 'step': 1409, 'epoch': 2} {'type': 'loss', 'content': 0.25214824080467224, 'timestamp': '2025-09-05 09:00:02.667383', 'step': 1410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:02.876346', 'step': 1410, 'epoch': 2} {'type': 'loss', 'content': 0.31711870431900024, 'timestamp': '2025-09-05 09:00:02.878119', 'step': 1411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:03.088023', 'step': 1411, 'epoch': 2} {'type': 'loss', 'content': 0.5284944772720337, 'timestamp': '2025-09-05 09:00:03.102500', 'step': 1412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:03.300795', 'step': 1412, 'epoch': 2} {'type': 'loss', 'content': 0.3170612156391144, 'timestamp': '2025-09-05 09:00:03.302474', 'step': 1413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:03.511578', 'step': 1413, 'epoch': 2} {'type': 'loss', 'content': 0.19468443095684052, 'timestamp': '2025-09-05 09:00:03.513839', 'step': 1414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:03.767100', 'step': 1414, 'epoch': 2} {'type': 'loss', 'content': 0.26165586709976196, 'timestamp': '2025-09-05 09:00:03.811286', 'step': 1415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:04.059572', 'step': 1415, 'epoch': 2} {'type': 'loss', 'content': 0.32013124227523804, 'timestamp': '2025-09-05 09:00:04.069440', 'step': 1416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:04.236919', 'step': 1416, 'epoch': 2} {'type': 'loss', 'content': 0.36106082797050476, 'timestamp': '2025-09-05 09:00:04.240225', 'step': 1417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:04.455188', 'step': 1417, 'epoch': 2} {'type': 'loss', 'content': 0.3524361252784729, 'timestamp': '2025-09-05 09:00:04.458212', 'step': 1418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:04.725938', 'step': 1418, 'epoch': 2} {'type': 'loss', 'content': 0.25635817646980286, 'timestamp': '2025-09-05 09:00:04.728692', 'step': 1419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:04.941135', 'step': 1419, 'epoch': 2} {'type': 'loss', 'content': 0.24682959914207458, 'timestamp': '2025-09-05 09:00:04.957692', 'step': 1420, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:00:10.139186', 'step': 1420, 'epoch': 2} {'type': 'pplx', 'content': 57.352158708318434, 'timestamp': '2025-09-05 09:00:10.142827', 'step': 1420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:10.304786', 'step': 1420, 'epoch': 2} {'type': 'loss', 'content': 0.40001311898231506, 'timestamp': '2025-09-05 09:00:10.308705', 'step': 1421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:10.507444', 'step': 1421, 'epoch': 2} {'type': 'loss', 'content': 0.30492401123046875, 'timestamp': '2025-09-05 09:00:10.536910', 'step': 1422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:10.790334', 'step': 1422, 'epoch': 2} {'type': 'loss', 'content': 0.3323606252670288, 'timestamp': '2025-09-05 09:00:10.792575', 'step': 1423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:11.006202', 'step': 1423, 'epoch': 2} {'type': 'loss', 'content': 0.23769192397594452, 'timestamp': '2025-09-05 09:00:11.021212', 'step': 1424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:11.234231', 'step': 1424, 'epoch': 2} {'type': 'loss', 'content': 0.3142143189907074, 'timestamp': '2025-09-05 09:00:11.237890', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:11.531231', 'step': 1425, 'epoch': 2} {'type': 'loss', 'content': 0.3281928300857544, 'timestamp': '2025-09-05 09:00:11.532868', 'step': 1426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:11.742869', 'step': 1426, 'epoch': 2} {'type': 'loss', 'content': 0.4385247528553009, 'timestamp': '2025-09-05 09:00:11.744405', 'step': 1427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:11.943393', 'step': 1427, 'epoch': 2} {'type': 'loss', 'content': 0.3019809126853943, 'timestamp': '2025-09-05 09:00:12.000833', 'step': 1428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:12.245974', 'step': 1428, 'epoch': 2} {'type': 'loss', 'content': 0.374027818441391, 'timestamp': '2025-09-05 09:00:12.247971', 'step': 1429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:12.445020', 'step': 1429, 'epoch': 2} {'type': 'loss', 'content': 0.2926400601863861, 'timestamp': '2025-09-05 09:00:12.446711', 'step': 1430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:12.616677', 'step': 1430, 'epoch': 2} {'type': 'loss', 'content': 0.2943369150161743, 'timestamp': '2025-09-05 09:00:12.618732', 'step': 1431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:12.828550', 'step': 1431, 'epoch': 2} {'type': 'loss', 'content': 0.25276631116867065, 'timestamp': '2025-09-05 09:00:12.838911', 'step': 1432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:00:13.004531', 'step': 1432, 'epoch': 2} {'type': 'loss', 'content': 0.47622016072273254, 'timestamp': '2025-09-05 09:00:13.006782', 'step': 1433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:13.213452', 'step': 1433, 'epoch': 2} {'type': 'loss', 'content': 0.3429078161716461, 'timestamp': '2025-09-05 09:00:13.215745', 'step': 1434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:13.423244', 'step': 1434, 'epoch': 2} {'type': 'loss', 'content': 0.2541375756263733, 'timestamp': '2025-09-05 09:00:13.425629', 'step': 1435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:13.622643', 'step': 1435, 'epoch': 2} {'type': 'loss', 'content': 0.42097604274749756, 'timestamp': '2025-09-05 09:00:13.638488', 'step': 1436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:13.834396', 'step': 1436, 'epoch': 2} {'type': 'loss', 'content': 0.29442155361175537, 'timestamp': '2025-09-05 09:00:13.838219', 'step': 1437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:14.051866', 'step': 1437, 'epoch': 2} {'type': 'loss', 'content': 0.32426849007606506, 'timestamp': '2025-09-05 09:00:14.054791', 'step': 1438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:14.267023', 'step': 1438, 'epoch': 2} {'type': 'loss', 'content': 0.3755996525287628, 'timestamp': '2025-09-05 09:00:14.269523', 'step': 1439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:14.476738', 'step': 1439, 'epoch': 2} {'type': 'loss', 'content': 0.2896839678287506, 'timestamp': '2025-09-05 09:00:14.492129', 'step': 1440, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:00:19.555529', 'step': 1440, 'epoch': 2} {'type': 'pplx', 'content': 57.297274431293886, 'timestamp': '2025-09-05 09:00:19.558449', 'step': 1440, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1440', 'timestamp': '2025-09-05 09:00:20.192961', 'step': 1440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:20.379456', 'step': 1440, 'epoch': 2} {'type': 'loss', 'content': 0.3088245391845703, 'timestamp': '2025-09-05 09:00:20.381264', 'step': 1441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:20.582334', 'step': 1441, 'epoch': 2} {'type': 'loss', 'content': 0.40866950154304504, 'timestamp': '2025-09-05 09:00:20.584803', 'step': 1442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:20.783709', 'step': 1442, 'epoch': 2} {'type': 'loss', 'content': 0.40102720260620117, 'timestamp': '2025-09-05 09:00:20.787143', 'step': 1443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:00:20.985647', 'step': 1443, 'epoch': 2} {'type': 'loss', 'content': 0.29152411222457886, 'timestamp': '2025-09-05 09:00:21.001648', 'step': 1444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:21.248024', 'step': 1444, 'epoch': 2} {'type': 'loss', 'content': 0.35733139514923096, 'timestamp': '2025-09-05 09:00:21.250634', 'step': 1445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:21.560840', 'step': 1445, 'epoch': 2} {'type': 'loss', 'content': 0.2699089050292969, 'timestamp': '2025-09-05 09:00:21.563859', 'step': 1446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:21.762868', 'step': 1446, 'epoch': 2} {'type': 'loss', 'content': 0.39894458651542664, 'timestamp': '2025-09-05 09:00:21.765328', 'step': 1447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:21.973633', 'step': 1447, 'epoch': 2} {'type': 'loss', 'content': 0.31270933151245117, 'timestamp': '2025-09-05 09:00:21.988487', 'step': 1448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:22.181427', 'step': 1448, 'epoch': 2} {'type': 'loss', 'content': 0.468645304441452, 'timestamp': '2025-09-05 09:00:22.183887', 'step': 1449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:22.382002', 'step': 1449, 'epoch': 2} {'type': 'loss', 'content': 0.2627173066139221, 'timestamp': '2025-09-05 09:00:22.384074', 'step': 1450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:22.590432', 'step': 1450, 'epoch': 2} {'type': 'loss', 'content': 0.21120664477348328, 'timestamp': '2025-09-05 09:00:22.593672', 'step': 1451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:22.845576', 'step': 1451, 'epoch': 2} {'type': 'loss', 'content': 0.37008100748062134, 'timestamp': '2025-09-05 09:00:22.860746', 'step': 1452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:23.057719', 'step': 1452, 'epoch': 2} {'type': 'loss', 'content': 0.2987282872200012, 'timestamp': '2025-09-05 09:00:23.060153', 'step': 1453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:23.328956', 'step': 1453, 'epoch': 2} {'type': 'loss', 'content': 0.4526394307613373, 'timestamp': '2025-09-05 09:00:23.330758', 'step': 1454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:23.532425', 'step': 1454, 'epoch': 2} {'type': 'loss', 'content': 0.4313774108886719, 'timestamp': '2025-09-05 09:00:23.535905', 'step': 1455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:23.738330', 'step': 1455, 'epoch': 2} {'type': 'loss', 'content': 0.3727917969226837, 'timestamp': '2025-09-05 09:00:23.754096', 'step': 1456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:23.956198', 'step': 1456, 'epoch': 2} {'type': 'loss', 'content': 0.3371935486793518, 'timestamp': '2025-09-05 09:00:23.958709', 'step': 1457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:24.156841', 'step': 1457, 'epoch': 2} {'type': 'loss', 'content': 0.29328054189682007, 'timestamp': '2025-09-05 09:00:24.158646', 'step': 1458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:24.387752', 'step': 1458, 'epoch': 2} {'type': 'loss', 'content': 0.4609324336051941, 'timestamp': '2025-09-05 09:00:24.389694', 'step': 1459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:24.586114', 'step': 1459, 'epoch': 2} {'type': 'loss', 'content': 0.3523360788822174, 'timestamp': '2025-09-05 09:00:24.600107', 'step': 1460, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:00:29.592360', 'step': 1460, 'epoch': 2} {'type': 'pplx', 'content': 57.27989146436504, 'timestamp': '2025-09-05 09:00:29.594078', 'step': 1460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:29.757891', 'step': 1460, 'epoch': 2} {'type': 'loss', 'content': 0.272165983915329, 'timestamp': '2025-09-05 09:00:29.760268', 'step': 1461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:29.965527', 'step': 1461, 'epoch': 2} {'type': 'loss', 'content': 0.4126870334148407, 'timestamp': '2025-09-05 09:00:29.967387', 'step': 1462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:00:30.219610', 'step': 1462, 'epoch': 2} {'type': 'loss', 'content': 0.3365689218044281, 'timestamp': '2025-09-05 09:00:30.221690', 'step': 1463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:30.419896', 'step': 1463, 'epoch': 2} {'type': 'loss', 'content': 0.3105847239494324, 'timestamp': '2025-09-05 09:00:30.434605', 'step': 1464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:30.624678', 'step': 1464, 'epoch': 2} {'type': 'loss', 'content': 0.3041870594024658, 'timestamp': '2025-09-05 09:00:30.669491', 'step': 1465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:30.908522', 'step': 1465, 'epoch': 2} {'type': 'loss', 'content': 0.21601124107837677, 'timestamp': '2025-09-05 09:00:30.951473', 'step': 1466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:31.201263', 'step': 1466, 'epoch': 2} {'type': 'loss', 'content': 0.329571008682251, 'timestamp': '2025-09-05 09:00:31.203271', 'step': 1467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:31.454663', 'step': 1467, 'epoch': 2} {'type': 'loss', 'content': 0.3269669711589813, 'timestamp': '2025-09-05 09:00:31.469803', 'step': 1468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:31.664022', 'step': 1468, 'epoch': 2} {'type': 'loss', 'content': 0.45828115940093994, 'timestamp': '2025-09-05 09:00:31.666165', 'step': 1469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:31.905612', 'step': 1469, 'epoch': 2} {'type': 'loss', 'content': 0.3074597716331482, 'timestamp': '2025-09-05 09:00:31.907252', 'step': 1470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:32.113377', 'step': 1470, 'epoch': 2} {'type': 'loss', 'content': 0.45386582612991333, 'timestamp': '2025-09-05 09:00:32.115039', 'step': 1471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:32.323220', 'step': 1471, 'epoch': 2} {'type': 'loss', 'content': 0.31208494305610657, 'timestamp': '2025-09-05 09:00:32.340131', 'step': 1472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:32.539778', 'step': 1472, 'epoch': 2} {'type': 'loss', 'content': 0.30297181010246277, 'timestamp': '2025-09-05 09:00:32.546254', 'step': 1473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:32.754227', 'step': 1473, 'epoch': 2} {'type': 'loss', 'content': 0.24466463923454285, 'timestamp': '2025-09-05 09:00:32.755965', 'step': 1474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:33.051143', 'step': 1474, 'epoch': 2} {'type': 'loss', 'content': 0.22954264283180237, 'timestamp': '2025-09-05 09:00:33.052843', 'step': 1475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:33.249729', 'step': 1475, 'epoch': 2} {'type': 'loss', 'content': 0.2577112317085266, 'timestamp': '2025-09-05 09:00:33.266023', 'step': 1476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:33.454539', 'step': 1476, 'epoch': 2} {'type': 'loss', 'content': 0.4258563220500946, 'timestamp': '2025-09-05 09:00:33.496351', 'step': 1477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:33.792317', 'step': 1477, 'epoch': 2} {'type': 'loss', 'content': 0.28466764092445374, 'timestamp': '2025-09-05 09:00:33.794271', 'step': 1478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:34.046935', 'step': 1478, 'epoch': 2} {'type': 'loss', 'content': 0.3980422914028168, 'timestamp': '2025-09-05 09:00:34.093014', 'step': 1479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:34.304162', 'step': 1479, 'epoch': 2} {'type': 'loss', 'content': 0.36734020709991455, 'timestamp': '2025-09-05 09:00:34.321087', 'step': 1480, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:00:39.490206', 'step': 1480, 'epoch': 2} {'type': 'pplx', 'content': 57.14511724528793, 'timestamp': '2025-09-05 09:00:39.492052', 'step': 1480, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1480', 'timestamp': '2025-09-05 09:00:39.944046', 'step': 1480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:00:40.106662', 'step': 1480, 'epoch': 2} {'type': 'loss', 'content': 0.2696632444858551, 'timestamp': '2025-09-05 09:00:40.108366', 'step': 1481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:40.306117', 'step': 1481, 'epoch': 2} {'type': 'loss', 'content': 0.2808653712272644, 'timestamp': '2025-09-05 09:00:40.308173', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:40.519058', 'step': 1482, 'epoch': 2} {'type': 'loss', 'content': 0.43718430399894714, 'timestamp': '2025-09-05 09:00:40.521238', 'step': 1483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:40.729677', 'step': 1483, 'epoch': 2} {'type': 'loss', 'content': 0.28544357419013977, 'timestamp': '2025-09-05 09:00:40.746202', 'step': 1484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:40.945290', 'step': 1484, 'epoch': 2} {'type': 'loss', 'content': 0.18251703679561615, 'timestamp': '2025-09-05 09:00:40.947259', 'step': 1485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:41.155284', 'step': 1485, 'epoch': 2} {'type': 'loss', 'content': 0.38339364528656006, 'timestamp': '2025-09-05 09:00:41.197437', 'step': 1486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:41.450497', 'step': 1486, 'epoch': 2} {'type': 'loss', 'content': 0.34182044863700867, 'timestamp': '2025-09-05 09:00:41.494088', 'step': 1487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:41.694560', 'step': 1487, 'epoch': 2} {'type': 'loss', 'content': 0.27841123938560486, 'timestamp': '2025-09-05 09:00:41.711449', 'step': 1488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:41.917243', 'step': 1488, 'epoch': 2} {'type': 'loss', 'content': 0.342913955450058, 'timestamp': '2025-09-05 09:00:41.920331', 'step': 1489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:42.129925', 'step': 1489, 'epoch': 2} {'type': 'loss', 'content': 0.31036046147346497, 'timestamp': '2025-09-05 09:00:42.131756', 'step': 1490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:42.339428', 'step': 1490, 'epoch': 2} {'type': 'loss', 'content': 0.24073675274848938, 'timestamp': '2025-09-05 09:00:42.415206', 'step': 1491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:42.628445', 'step': 1491, 'epoch': 2} {'type': 'loss', 'content': 0.43674877285957336, 'timestamp': '2025-09-05 09:00:42.644955', 'step': 1492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:42.928380', 'step': 1492, 'epoch': 2} {'type': 'loss', 'content': 0.2842003107070923, 'timestamp': '2025-09-05 09:00:42.930866', 'step': 1493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:43.130467', 'step': 1493, 'epoch': 2} {'type': 'loss', 'content': 0.2694554924964905, 'timestamp': '2025-09-05 09:00:43.132760', 'step': 1494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:43.344581', 'step': 1494, 'epoch': 2} {'type': 'loss', 'content': 0.4056885540485382, 'timestamp': '2025-09-05 09:00:43.387356', 'step': 1495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:43.636302', 'step': 1495, 'epoch': 2} {'type': 'loss', 'content': 0.4579038918018341, 'timestamp': '2025-09-05 09:00:43.652646', 'step': 1496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:43.851183', 'step': 1496, 'epoch': 2} {'type': 'loss', 'content': 0.31665748357772827, 'timestamp': '2025-09-05 09:00:43.853144', 'step': 1497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:44.062967', 'step': 1497, 'epoch': 2} {'type': 'loss', 'content': 0.28368595242500305, 'timestamp': '2025-09-05 09:00:44.065000', 'step': 1498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:44.266543', 'step': 1498, 'epoch': 2} {'type': 'loss', 'content': 0.36625730991363525, 'timestamp': '2025-09-05 09:00:44.268982', 'step': 1499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:44.467553', 'step': 1499, 'epoch': 2} {'type': 'loss', 'content': 0.35754650831222534, 'timestamp': '2025-09-05 09:00:44.484265', 'step': 1500, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:00:49.578608', 'step': 1500, 'epoch': 2} {'type': 'pplx', 'content': 57.19569553462994, 'timestamp': '2025-09-05 09:00:49.580840', 'step': 1500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:49.743136', 'step': 1500, 'epoch': 2} {'type': 'loss', 'content': 0.2610255479812622, 'timestamp': '2025-09-05 09:00:49.745210', 'step': 1501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:49.911404', 'step': 1501, 'epoch': 2} {'type': 'loss', 'content': 0.3230501115322113, 'timestamp': '2025-09-05 09:00:49.913745', 'step': 1502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:00:50.122490', 'step': 1502, 'epoch': 2} {'type': 'loss', 'content': 0.42344826459884644, 'timestamp': '2025-09-05 09:00:50.124894', 'step': 1503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:50.448049', 'step': 1503, 'epoch': 2} {'type': 'loss', 'content': 0.27271905541419983, 'timestamp': '2025-09-05 09:00:50.504586', 'step': 1504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:50.761945', 'step': 1504, 'epoch': 2} {'type': 'loss', 'content': 0.38530251383781433, 'timestamp': '2025-09-05 09:00:50.763482', 'step': 1505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:50.933199', 'step': 1505, 'epoch': 2} {'type': 'loss', 'content': 0.19776545464992523, 'timestamp': '2025-09-05 09:00:50.934817', 'step': 1506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:51.142295', 'step': 1506, 'epoch': 2} {'type': 'loss', 'content': 0.4128006398677826, 'timestamp': '2025-09-05 09:00:51.144305', 'step': 1507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:51.342140', 'step': 1507, 'epoch': 2} {'type': 'loss', 'content': 0.2871294915676117, 'timestamp': '2025-09-05 09:00:51.358189', 'step': 1508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:51.547224', 'step': 1508, 'epoch': 2} {'type': 'loss', 'content': 0.24478839337825775, 'timestamp': '2025-09-05 09:00:51.548878', 'step': 1509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:51.715852', 'step': 1509, 'epoch': 2} {'type': 'loss', 'content': 0.2897082269191742, 'timestamp': '2025-09-05 09:00:51.717461', 'step': 1510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:51.887071', 'step': 1510, 'epoch': 2} {'type': 'loss', 'content': 0.2650999426841736, 'timestamp': '2025-09-05 09:00:51.888963', 'step': 1511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:52.083755', 'step': 1511, 'epoch': 2} {'type': 'loss', 'content': 0.33520272374153137, 'timestamp': '2025-09-05 09:00:52.100558', 'step': 1512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:52.297285', 'step': 1512, 'epoch': 2} {'type': 'loss', 'content': 0.2601361572742462, 'timestamp': '2025-09-05 09:00:52.298983', 'step': 1513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:52.494605', 'step': 1513, 'epoch': 2} {'type': 'loss', 'content': 0.3293619453907013, 'timestamp': '2025-09-05 09:00:52.496222', 'step': 1514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:00:52.703711', 'step': 1514, 'epoch': 2} {'type': 'loss', 'content': 0.3966652750968933, 'timestamp': '2025-09-05 09:00:52.706032', 'step': 1515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:52.902753', 'step': 1515, 'epoch': 2} {'type': 'loss', 'content': 0.46486595273017883, 'timestamp': '2025-09-05 09:00:52.912613', 'step': 1516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:00:53.075232', 'step': 1516, 'epoch': 2} {'type': 'loss', 'content': 0.2847346067428589, 'timestamp': '2025-09-05 09:00:53.077884', 'step': 1517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:53.247343', 'step': 1517, 'epoch': 2} {'type': 'loss', 'content': 0.3045147657394409, 'timestamp': '2025-09-05 09:00:53.264732', 'step': 1518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:53.472369', 'step': 1518, 'epoch': 2} {'type': 'loss', 'content': 0.34377965331077576, 'timestamp': '2025-09-05 09:00:53.474258', 'step': 1519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:53.681046', 'step': 1519, 'epoch': 2} {'type': 'loss', 'content': 0.353103905916214, 'timestamp': '2025-09-05 09:00:53.695217', 'step': 1520, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:00:58.747097', 'step': 1520, 'epoch': 2} {'type': 'pplx', 'content': 57.199012932244166, 'timestamp': '2025-09-05 09:00:58.749409', 'step': 1520, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1520', 'timestamp': '2025-09-05 09:00:59.347636', 'step': 1520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:00:59.530819', 'step': 1520, 'epoch': 2} {'type': 'loss', 'content': 0.2339445799589157, 'timestamp': '2025-09-05 09:00:59.532778', 'step': 1521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:59.789659', 'step': 1521, 'epoch': 2} {'type': 'loss', 'content': 0.41792815923690796, 'timestamp': '2025-09-05 09:00:59.791267', 'step': 1522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:00:59.997972', 'step': 1522, 'epoch': 2} {'type': 'loss', 'content': 0.26164618134498596, 'timestamp': '2025-09-05 09:01:00.001215', 'step': 1523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:00.201106', 'step': 1523, 'epoch': 2} {'type': 'loss', 'content': 0.3471045196056366, 'timestamp': '2025-09-05 09:01:00.215901', 'step': 1524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:00.461448', 'step': 1524, 'epoch': 2} {'type': 'loss', 'content': 0.38675960898399353, 'timestamp': '2025-09-05 09:01:00.464367', 'step': 1525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:00.672850', 'step': 1525, 'epoch': 2} {'type': 'loss', 'content': 0.36886975169181824, 'timestamp': '2025-09-05 09:01:00.674827', 'step': 1526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:00.875762', 'step': 1526, 'epoch': 2} {'type': 'loss', 'content': 0.2916344106197357, 'timestamp': '2025-09-05 09:01:00.877994', 'step': 1527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:01.186868', 'step': 1527, 'epoch': 2} {'type': 'loss', 'content': 0.2596033215522766, 'timestamp': '2025-09-05 09:01:01.245041', 'step': 1528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:01.553114', 'step': 1528, 'epoch': 2} {'type': 'loss', 'content': 0.41993069648742676, 'timestamp': '2025-09-05 09:01:01.554722', 'step': 1529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:01.762589', 'step': 1529, 'epoch': 2} {'type': 'loss', 'content': 0.3928647041320801, 'timestamp': '2025-09-05 09:01:01.764333', 'step': 1530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:01.963776', 'step': 1530, 'epoch': 2} {'type': 'loss', 'content': 0.24546408653259277, 'timestamp': '2025-09-05 09:01:01.966150', 'step': 1531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:02.164161', 'step': 1531, 'epoch': 2} {'type': 'loss', 'content': 0.3285897970199585, 'timestamp': '2025-09-05 09:01:02.180529', 'step': 1532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:02.380667', 'step': 1532, 'epoch': 2} {'type': 'loss', 'content': 0.23597146570682526, 'timestamp': '2025-09-05 09:01:02.382749', 'step': 1533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:02.592689', 'step': 1533, 'epoch': 2} {'type': 'loss', 'content': 0.2703210413455963, 'timestamp': '2025-09-05 09:01:02.594777', 'step': 1534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:02.845322', 'step': 1534, 'epoch': 2} {'type': 'loss', 'content': 0.3228624165058136, 'timestamp': '2025-09-05 09:01:02.847462', 'step': 1535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:03.047096', 'step': 1535, 'epoch': 2} {'type': 'loss', 'content': 0.3633241355419159, 'timestamp': '2025-09-05 09:01:03.061335', 'step': 1536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:03.258802', 'step': 1536, 'epoch': 2} {'type': 'loss', 'content': 0.3116025924682617, 'timestamp': '2025-09-05 09:01:03.261151', 'step': 1537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:03.505735', 'step': 1537, 'epoch': 2} {'type': 'loss', 'content': 0.2656325101852417, 'timestamp': '2025-09-05 09:01:03.507750', 'step': 1538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:03.714858', 'step': 1538, 'epoch': 2} {'type': 'loss', 'content': 0.502154529094696, 'timestamp': '2025-09-05 09:01:03.757830', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:04.001581', 'step': 1539, 'epoch': 2} {'type': 'loss', 'content': 0.25501522421836853, 'timestamp': '2025-09-05 09:01:04.056208', 'step': 1540, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:01:09.059460', 'step': 1540, 'epoch': 2} {'type': 'pplx', 'content': 56.22339176817989, 'timestamp': '2025-09-05 09:01:09.061549', 'step': 1540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:09.222637', 'step': 1540, 'epoch': 2} {'type': 'loss', 'content': 0.33964210748672485, 'timestamp': '2025-09-05 09:01:09.224561', 'step': 1541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:09.390371', 'step': 1541, 'epoch': 2} {'type': 'loss', 'content': 0.3216552436351776, 'timestamp': '2025-09-05 09:01:09.392053', 'step': 1542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:09.609437', 'step': 1542, 'epoch': 2} {'type': 'loss', 'content': 0.46677759289741516, 'timestamp': '2025-09-05 09:01:09.611720', 'step': 1543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:09.820890', 'step': 1543, 'epoch': 2} {'type': 'loss', 'content': 0.4672287404537201, 'timestamp': '2025-09-05 09:01:09.835069', 'step': 1544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:10.024285', 'step': 1544, 'epoch': 2} {'type': 'loss', 'content': 0.31345078349113464, 'timestamp': '2025-09-05 09:01:10.026199', 'step': 1545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:10.235065', 'step': 1545, 'epoch': 2} {'type': 'loss', 'content': 0.25660619139671326, 'timestamp': '2025-09-05 09:01:10.237026', 'step': 1546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:10.441232', 'step': 1546, 'epoch': 2} {'type': 'loss', 'content': 0.2932271957397461, 'timestamp': '2025-09-05 09:01:10.443096', 'step': 1547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:10.609307', 'step': 1547, 'epoch': 2} {'type': 'loss', 'content': 0.28923869132995605, 'timestamp': '2025-09-05 09:01:10.626261', 'step': 1548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:10.873301', 'step': 1548, 'epoch': 2} {'type': 'loss', 'content': 0.2676112949848175, 'timestamp': '2025-09-05 09:01:10.875539', 'step': 1549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:11.121598', 'step': 1549, 'epoch': 2} {'type': 'loss', 'content': 0.24210013449192047, 'timestamp': '2025-09-05 09:01:11.123753', 'step': 1550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:11.320871', 'step': 1550, 'epoch': 2} {'type': 'loss', 'content': 0.31983768939971924, 'timestamp': '2025-09-05 09:01:11.322439', 'step': 1551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:11.529052', 'step': 1551, 'epoch': 2} {'type': 'loss', 'content': 0.2809242904186249, 'timestamp': '2025-09-05 09:01:11.543506', 'step': 1552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:11.788938', 'step': 1552, 'epoch': 2} {'type': 'loss', 'content': 0.3017968237400055, 'timestamp': '2025-09-05 09:01:11.791257', 'step': 1553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:12.041437', 'step': 1553, 'epoch': 2} {'type': 'loss', 'content': 0.32433098554611206, 'timestamp': '2025-09-05 09:01:12.043576', 'step': 1554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:12.248072', 'step': 1554, 'epoch': 2} {'type': 'loss', 'content': 0.5199801921844482, 'timestamp': '2025-09-05 09:01:12.249996', 'step': 1555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:01:12.416761', 'step': 1555, 'epoch': 2} {'type': 'loss', 'content': 0.303520530462265, 'timestamp': '2025-09-05 09:01:12.476023', 'step': 1556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:12.719674', 'step': 1556, 'epoch': 2} {'type': 'loss', 'content': 0.4727158844470978, 'timestamp': '2025-09-05 09:01:12.721369', 'step': 1557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:12.920588', 'step': 1557, 'epoch': 2} {'type': 'loss', 'content': 0.3274668753147125, 'timestamp': '2025-09-05 09:01:12.922360', 'step': 1558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:13.120064', 'step': 1558, 'epoch': 2} {'type': 'loss', 'content': 0.35360151529312134, 'timestamp': '2025-09-05 09:01:13.121761', 'step': 1559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:13.304478', 'step': 1559, 'epoch': 2} {'type': 'loss', 'content': 0.4361201524734497, 'timestamp': '2025-09-05 09:01:13.313794', 'step': 1560, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:01:18.338585', 'step': 1560, 'epoch': 2} {'type': 'pplx', 'content': 56.41456573621123, 'timestamp': '2025-09-05 09:01:18.350072', 'step': 1560, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1560', 'timestamp': '2025-09-05 09:01:18.864357', 'step': 1560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:19.052137', 'step': 1560, 'epoch': 2} {'type': 'loss', 'content': 0.3437816798686981, 'timestamp': '2025-09-05 09:01:19.086911', 'step': 1561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:19.346975', 'step': 1561, 'epoch': 2} {'type': 'loss', 'content': 0.3193230926990509, 'timestamp': '2025-09-05 09:01:19.348708', 'step': 1562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:19.554605', 'step': 1562, 'epoch': 2} {'type': 'loss', 'content': 0.3251970708370209, 'timestamp': '2025-09-05 09:01:19.560596', 'step': 1563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:19.760161', 'step': 1563, 'epoch': 2} {'type': 'loss', 'content': 0.3838941156864166, 'timestamp': '2025-09-05 09:01:19.776545', 'step': 1564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:19.974450', 'step': 1564, 'epoch': 2} {'type': 'loss', 'content': 0.3207242488861084, 'timestamp': '2025-09-05 09:01:19.976164', 'step': 1565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:20.242298', 'step': 1565, 'epoch': 2} {'type': 'loss', 'content': 0.3163876235485077, 'timestamp': '2025-09-05 09:01:20.244368', 'step': 1566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:20.440224', 'step': 1566, 'epoch': 2} {'type': 'loss', 'content': 0.2813356816768646, 'timestamp': '2025-09-05 09:01:20.442252', 'step': 1567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:20.647109', 'step': 1567, 'epoch': 2} {'type': 'loss', 'content': 0.19117353856563568, 'timestamp': '2025-09-05 09:01:20.661404', 'step': 1568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:20.888763', 'step': 1568, 'epoch': 2} {'type': 'loss', 'content': 0.34964126348495483, 'timestamp': '2025-09-05 09:01:20.890577', 'step': 1569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:21.138262', 'step': 1569, 'epoch': 2} {'type': 'loss', 'content': 0.3004063367843628, 'timestamp': '2025-09-05 09:01:21.141055', 'step': 1570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:21.335800', 'step': 1570, 'epoch': 2} {'type': 'loss', 'content': 0.3171744644641876, 'timestamp': '2025-09-05 09:01:21.339587', 'step': 1571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:21.533662', 'step': 1571, 'epoch': 2} {'type': 'loss', 'content': 0.47390928864479065, 'timestamp': '2025-09-05 09:01:21.551024', 'step': 1572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:21.746161', 'step': 1572, 'epoch': 2} {'type': 'loss', 'content': 0.34400877356529236, 'timestamp': '2025-09-05 09:01:21.749209', 'step': 1573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:21.914493', 'step': 1573, 'epoch': 2} {'type': 'loss', 'content': 0.3936767876148224, 'timestamp': '2025-09-05 09:01:21.917943', 'step': 1574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:22.122702', 'step': 1574, 'epoch': 2} {'type': 'loss', 'content': 0.2832147181034088, 'timestamp': '2025-09-05 09:01:22.125039', 'step': 1575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:22.291669', 'step': 1575, 'epoch': 2} {'type': 'loss', 'content': 0.2926376760005951, 'timestamp': '2025-09-05 09:01:22.309042', 'step': 1576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:22.551858', 'step': 1576, 'epoch': 2} {'type': 'loss', 'content': 0.4421359896659851, 'timestamp': '2025-09-05 09:01:22.555003', 'step': 1577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:22.749743', 'step': 1577, 'epoch': 2} {'type': 'loss', 'content': 0.3380582928657532, 'timestamp': '2025-09-05 09:01:22.752462', 'step': 1578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:22.949713', 'step': 1578, 'epoch': 2} {'type': 'loss', 'content': 0.23209118843078613, 'timestamp': '2025-09-05 09:01:22.952426', 'step': 1579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:23.244611', 'step': 1579, 'epoch': 2} {'type': 'loss', 'content': 0.29452741146087646, 'timestamp': '2025-09-05 09:01:23.260250', 'step': 1580, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:01:28.387804', 'step': 1580, 'epoch': 2} {'type': 'pplx', 'content': 56.31358010394533, 'timestamp': '2025-09-05 09:01:28.390146', 'step': 1580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:28.552853', 'step': 1580, 'epoch': 2} {'type': 'loss', 'content': 0.22305051982402802, 'timestamp': '2025-09-05 09:01:28.555962', 'step': 1581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:28.762299', 'step': 1581, 'epoch': 2} {'type': 'loss', 'content': 0.4951125383377075, 'timestamp': '2025-09-05 09:01:28.765755', 'step': 1582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:28.965837', 'step': 1582, 'epoch': 2} {'type': 'loss', 'content': 0.2661086320877075, 'timestamp': '2025-09-05 09:01:28.967787', 'step': 1583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:29.150701', 'step': 1583, 'epoch': 2} {'type': 'loss', 'content': 0.3857158124446869, 'timestamp': '2025-09-05 09:01:29.160993', 'step': 1584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:29.324956', 'step': 1584, 'epoch': 2} {'type': 'loss', 'content': 0.32872211933135986, 'timestamp': '2025-09-05 09:01:29.326514', 'step': 1585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:01:29.531376', 'step': 1585, 'epoch': 2} {'type': 'loss', 'content': 0.3093208074569702, 'timestamp': '2025-09-05 09:01:29.533473', 'step': 1586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:29.733192', 'step': 1586, 'epoch': 2} {'type': 'loss', 'content': 0.29720935225486755, 'timestamp': '2025-09-05 09:01:29.735435', 'step': 1587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:29.930239', 'step': 1587, 'epoch': 2} {'type': 'loss', 'content': 0.3052479922771454, 'timestamp': '2025-09-05 09:01:29.946354', 'step': 1588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:30.138120', 'step': 1588, 'epoch': 2} {'type': 'loss', 'content': 0.3246830105781555, 'timestamp': '2025-09-05 09:01:30.140093', 'step': 1589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:30.336315', 'step': 1589, 'epoch': 2} {'type': 'loss', 'content': 0.4002441465854645, 'timestamp': '2025-09-05 09:01:30.338455', 'step': 1590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:30.534261', 'step': 1590, 'epoch': 2} {'type': 'loss', 'content': 0.34371864795684814, 'timestamp': '2025-09-05 09:01:30.577760', 'step': 1591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:30.775757', 'step': 1591, 'epoch': 2} {'type': 'loss', 'content': 0.30698439478874207, 'timestamp': '2025-09-05 09:01:30.790514', 'step': 1592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:01:30.980985', 'step': 1592, 'epoch': 2} {'type': 'loss', 'content': 0.4689514636993408, 'timestamp': '2025-09-05 09:01:30.982847', 'step': 1593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:31.187822', 'step': 1593, 'epoch': 2} {'type': 'loss', 'content': 0.3119965195655823, 'timestamp': '2025-09-05 09:01:31.189428', 'step': 1594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:31.384273', 'step': 1594, 'epoch': 2} {'type': 'loss', 'content': 0.4260518550872803, 'timestamp': '2025-09-05 09:01:31.386455', 'step': 1595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:31.584055', 'step': 1595, 'epoch': 2} {'type': 'loss', 'content': 0.42162322998046875, 'timestamp': '2025-09-05 09:01:31.598523', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:31.842004', 'step': 1596, 'epoch': 2} {'type': 'loss', 'content': 0.3950115144252777, 'timestamp': '2025-09-05 09:01:31.844535', 'step': 1597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:32.042926', 'step': 1597, 'epoch': 2} {'type': 'loss', 'content': 0.3251371681690216, 'timestamp': '2025-09-05 09:01:32.045798', 'step': 1598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:32.242051', 'step': 1598, 'epoch': 2} {'type': 'loss', 'content': 0.4372723698616028, 'timestamp': '2025-09-05 09:01:32.245518', 'step': 1599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:32.442582', 'step': 1599, 'epoch': 2} {'type': 'loss', 'content': 0.2939607799053192, 'timestamp': '2025-09-05 09:01:32.459198', 'step': 1600, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:01:37.613280', 'step': 1600, 'epoch': 2} {'type': 'pplx', 'content': 56.03260913326416, 'timestamp': '2025-09-05 09:01:37.615081', 'step': 1600, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1600', 'timestamp': '2025-09-05 09:01:38.059228', 'step': 1600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:01:38.220521', 'step': 1600, 'epoch': 2} {'type': 'loss', 'content': 0.4079391062259674, 'timestamp': '2025-09-05 09:01:38.222270', 'step': 1601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:38.426775', 'step': 1601, 'epoch': 2} {'type': 'loss', 'content': 0.26604366302490234, 'timestamp': '2025-09-05 09:01:38.428405', 'step': 1602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:38.632567', 'step': 1602, 'epoch': 2} {'type': 'loss', 'content': 0.270683616399765, 'timestamp': '2025-09-05 09:01:38.634094', 'step': 1603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:38.832221', 'step': 1603, 'epoch': 2} {'type': 'loss', 'content': 0.3520897924900055, 'timestamp': '2025-09-05 09:01:38.846486', 'step': 1604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:39.035039', 'step': 1604, 'epoch': 2} {'type': 'loss', 'content': 0.4065357744693756, 'timestamp': '2025-09-05 09:01:39.037089', 'step': 1605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:01:39.231198', 'step': 1605, 'epoch': 2} {'type': 'loss', 'content': 0.4396074414253235, 'timestamp': '2025-09-05 09:01:39.233010', 'step': 1606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:39.432802', 'step': 1606, 'epoch': 2} {'type': 'loss', 'content': 0.19596950709819794, 'timestamp': '2025-09-05 09:01:39.434547', 'step': 1607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:39.683964', 'step': 1607, 'epoch': 2} {'type': 'loss', 'content': 0.34500351548194885, 'timestamp': '2025-09-05 09:01:39.698962', 'step': 1608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:39.890897', 'step': 1608, 'epoch': 2} {'type': 'loss', 'content': 0.29165971279144287, 'timestamp': '2025-09-05 09:01:39.892845', 'step': 1609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:40.091985', 'step': 1609, 'epoch': 2} {'type': 'loss', 'content': 0.41614753007888794, 'timestamp': '2025-09-05 09:01:40.094774', 'step': 1610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:40.345641', 'step': 1610, 'epoch': 2} {'type': 'loss', 'content': 0.35462021827697754, 'timestamp': '2025-09-05 09:01:40.349976', 'step': 1611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:01:40.611172', 'step': 1611, 'epoch': 2} {'type': 'loss', 'content': 0.440545916557312, 'timestamp': '2025-09-05 09:01:40.626023', 'step': 1612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:40.824860', 'step': 1612, 'epoch': 2} {'type': 'loss', 'content': 0.30644723773002625, 'timestamp': '2025-09-05 09:01:40.827413', 'step': 1613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:41.078955', 'step': 1613, 'epoch': 2} {'type': 'loss', 'content': 0.31383422017097473, 'timestamp': '2025-09-05 09:01:41.081534', 'step': 1614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:41.278142', 'step': 1614, 'epoch': 2} {'type': 'loss', 'content': 0.4448195695877075, 'timestamp': '2025-09-05 09:01:41.280472', 'step': 1615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:41.473811', 'step': 1615, 'epoch': 2} {'type': 'loss', 'content': 0.2656569480895996, 'timestamp': '2025-09-05 09:01:41.488401', 'step': 1616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:41.678286', 'step': 1616, 'epoch': 2} {'type': 'loss', 'content': 0.27651652693748474, 'timestamp': '2025-09-05 09:01:41.680091', 'step': 1617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:41.929276', 'step': 1617, 'epoch': 2} {'type': 'loss', 'content': 0.22921140491962433, 'timestamp': '2025-09-05 09:01:41.931200', 'step': 1618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:42.128588', 'step': 1618, 'epoch': 2} {'type': 'loss', 'content': 0.24880188703536987, 'timestamp': '2025-09-05 09:01:42.130379', 'step': 1619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:42.339134', 'step': 1619, 'epoch': 2} {'type': 'loss', 'content': 0.27319225668907166, 'timestamp': '2025-09-05 09:01:42.395861', 'step': 1620, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:01:47.860884', 'step': 1620, 'epoch': 2} {'type': 'pplx', 'content': 55.03103150706018, 'timestamp': '2025-09-05 09:01:47.862612', 'step': 1620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:48.026237', 'step': 1620, 'epoch': 2} {'type': 'loss', 'content': 0.2473212331533432, 'timestamp': '2025-09-05 09:01:48.069969', 'step': 1621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:48.239318', 'step': 1621, 'epoch': 2} {'type': 'loss', 'content': 0.30699431896209717, 'timestamp': '2025-09-05 09:01:48.241670', 'step': 1622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:48.447505', 'step': 1622, 'epoch': 2} {'type': 'loss', 'content': 0.47052350640296936, 'timestamp': '2025-09-05 09:01:48.449557', 'step': 1623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:48.656637', 'step': 1623, 'epoch': 2} {'type': 'loss', 'content': 0.3628225028514862, 'timestamp': '2025-09-05 09:01:48.714869', 'step': 1624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:48.914680', 'step': 1624, 'epoch': 2} {'type': 'loss', 'content': 0.26334837079048157, 'timestamp': '2025-09-05 09:01:48.916727', 'step': 1625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:49.115550', 'step': 1625, 'epoch': 2} {'type': 'loss', 'content': 0.37516269087791443, 'timestamp': '2025-09-05 09:01:49.117110', 'step': 1626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:01:49.315505', 'step': 1626, 'epoch': 2} {'type': 'loss', 'content': 0.5237755179405212, 'timestamp': '2025-09-05 09:01:49.317923', 'step': 1627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:49.526667', 'step': 1627, 'epoch': 2} {'type': 'loss', 'content': 0.20416662096977234, 'timestamp': '2025-09-05 09:01:49.540424', 'step': 1628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:49.731734', 'step': 1628, 'epoch': 2} {'type': 'loss', 'content': 0.3799273371696472, 'timestamp': '2025-09-05 09:01:49.733539', 'step': 1629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:49.928401', 'step': 1629, 'epoch': 2} {'type': 'loss', 'content': 0.4185466170310974, 'timestamp': '2025-09-05 09:01:49.930768', 'step': 1630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:50.137540', 'step': 1630, 'epoch': 2} {'type': 'loss', 'content': 0.28220972418785095, 'timestamp': '2025-09-05 09:01:50.140239', 'step': 1631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:50.348233', 'step': 1631, 'epoch': 2} {'type': 'loss', 'content': 0.2010842263698578, 'timestamp': '2025-09-05 09:01:50.363989', 'step': 1632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:50.607652', 'step': 1632, 'epoch': 2} {'type': 'loss', 'content': 0.29958879947662354, 'timestamp': '2025-09-05 09:01:50.610206', 'step': 1633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:50.808964', 'step': 1633, 'epoch': 2} {'type': 'loss', 'content': 0.33167004585266113, 'timestamp': '2025-09-05 09:01:50.811115', 'step': 1634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:51.009267', 'step': 1634, 'epoch': 2} {'type': 'loss', 'content': 0.24998310208320618, 'timestamp': '2025-09-05 09:01:51.031265', 'step': 1635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:51.282600', 'step': 1635, 'epoch': 2} {'type': 'loss', 'content': 0.311273455619812, 'timestamp': '2025-09-05 09:01:51.297279', 'step': 1636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:51.488464', 'step': 1636, 'epoch': 2} {'type': 'loss', 'content': 0.28948909044265747, 'timestamp': '2025-09-05 09:01:51.490652', 'step': 1637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:01:51.746233', 'step': 1637, 'epoch': 2} {'type': 'loss', 'content': 0.2414962649345398, 'timestamp': '2025-09-05 09:01:51.748103', 'step': 1638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:51.945697', 'step': 1638, 'epoch': 2} {'type': 'loss', 'content': 0.26412102580070496, 'timestamp': '2025-09-05 09:01:51.947451', 'step': 1639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:52.142544', 'step': 1639, 'epoch': 2} {'type': 'loss', 'content': 0.27432331442832947, 'timestamp': '2025-09-05 09:01:52.160428', 'step': 1640, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:01:56.942968', 'step': 1640, 'epoch': 2} {'type': 'pplx', 'content': 54.70964771417281, 'timestamp': '2025-09-05 09:01:56.947175', 'step': 1640, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1640', 'timestamp': '2025-09-05 09:01:57.598988', 'step': 1640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:57.841522', 'step': 1640, 'epoch': 2} {'type': 'loss', 'content': 0.32458359003067017, 'timestamp': '2025-09-05 09:01:57.843776', 'step': 1641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:58.040943', 'step': 1641, 'epoch': 2} {'type': 'loss', 'content': 0.4251089096069336, 'timestamp': '2025-09-05 09:01:58.042669', 'step': 1642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:58.240063', 'step': 1642, 'epoch': 2} {'type': 'loss', 'content': 0.4108656942844391, 'timestamp': '2025-09-05 09:01:58.241816', 'step': 1643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:58.437720', 'step': 1643, 'epoch': 2} {'type': 'loss', 'content': 0.38851895928382874, 'timestamp': '2025-09-05 09:01:58.454504', 'step': 1644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:58.651986', 'step': 1644, 'epoch': 2} {'type': 'loss', 'content': 0.2178976833820343, 'timestamp': '2025-09-05 09:01:58.654067', 'step': 1645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:01:58.822924', 'step': 1645, 'epoch': 2} {'type': 'loss', 'content': 0.35721564292907715, 'timestamp': '2025-09-05 09:01:58.825316', 'step': 1646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:01:59.044925', 'step': 1646, 'epoch': 2} {'type': 'loss', 'content': 0.17846766114234924, 'timestamp': '2025-09-05 09:01:59.047352', 'step': 1647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:59.215050', 'step': 1647, 'epoch': 2} {'type': 'loss', 'content': 0.38131412863731384, 'timestamp': '2025-09-05 09:01:59.231949', 'step': 1648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:01:59.428199', 'step': 1648, 'epoch': 2} {'type': 'loss', 'content': 0.16115841269493103, 'timestamp': '2025-09-05 09:01:59.430055', 'step': 1649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:01:59.636473', 'step': 1649, 'epoch': 2} {'type': 'loss', 'content': 0.45021679997444153, 'timestamp': '2025-09-05 09:01:59.638966', 'step': 1650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:01:59.848917', 'step': 1650, 'epoch': 2} {'type': 'loss', 'content': 0.3558279871940613, 'timestamp': '2025-09-05 09:01:59.851023', 'step': 1651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:00.096329', 'step': 1651, 'epoch': 2} {'type': 'loss', 'content': 0.40276896953582764, 'timestamp': '2025-09-05 09:02:00.113286', 'step': 1652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:00.312785', 'step': 1652, 'epoch': 2} {'type': 'loss', 'content': 0.29273685812950134, 'timestamp': '2025-09-05 09:02:00.314731', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:00.563338', 'step': 1653, 'epoch': 2} {'type': 'loss', 'content': 0.4775174558162689, 'timestamp': '2025-09-05 09:02:00.565910', 'step': 1654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:00.763870', 'step': 1654, 'epoch': 2} {'type': 'loss', 'content': 0.3428560793399811, 'timestamp': '2025-09-05 09:02:00.767063', 'step': 1655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:01.016867', 'step': 1655, 'epoch': 2} {'type': 'loss', 'content': 0.3681947588920593, 'timestamp': '2025-09-05 09:02:01.031643', 'step': 1656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:01.218711', 'step': 1656, 'epoch': 2} {'type': 'loss', 'content': 0.4562534689903259, 'timestamp': '2025-09-05 09:02:01.220599', 'step': 1657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:01.606943', 'step': 1657, 'epoch': 2} {'type': 'loss', 'content': 0.38250502943992615, 'timestamp': '2025-09-05 09:02:01.609870', 'step': 1658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:01.806481', 'step': 1658, 'epoch': 2} {'type': 'loss', 'content': 0.4949440658092499, 'timestamp': '2025-09-05 09:02:01.809636', 'step': 1659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:02.006945', 'step': 1659, 'epoch': 2} {'type': 'loss', 'content': 0.30622151494026184, 'timestamp': '2025-09-05 09:02:02.022446', 'step': 1660, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:02:07.306404', 'step': 1660, 'epoch': 2} {'type': 'pplx', 'content': 55.28318964602614, 'timestamp': '2025-09-05 09:02:07.308770', 'step': 1660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:07.471588', 'step': 1660, 'epoch': 2} {'type': 'loss', 'content': 0.41278305649757385, 'timestamp': '2025-09-05 09:02:07.473751', 'step': 1661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:07.641861', 'step': 1661, 'epoch': 2} {'type': 'loss', 'content': 0.2784328758716583, 'timestamp': '2025-09-05 09:02:07.644612', 'step': 1662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:07.853696', 'step': 1662, 'epoch': 2} {'type': 'loss', 'content': 0.3032529056072235, 'timestamp': '2025-09-05 09:02:07.855716', 'step': 1663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:08.056549', 'step': 1663, 'epoch': 2} {'type': 'loss', 'content': 0.3818753659725189, 'timestamp': '2025-09-05 09:02:08.071192', 'step': 1664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:08.260992', 'step': 1664, 'epoch': 2} {'type': 'loss', 'content': 0.49340173602104187, 'timestamp': '2025-09-05 09:02:08.263931', 'step': 1665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:08.471439', 'step': 1665, 'epoch': 2} {'type': 'loss', 'content': 0.30880993604660034, 'timestamp': '2025-09-05 09:02:08.474238', 'step': 1666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:08.681287', 'step': 1666, 'epoch': 2} {'type': 'loss', 'content': 0.34679117798805237, 'timestamp': '2025-09-05 09:02:08.683188', 'step': 1667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:08.890210', 'step': 1667, 'epoch': 2} {'type': 'loss', 'content': 0.2942996919155121, 'timestamp': '2025-09-05 09:02:08.905288', 'step': 1668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:09.094441', 'step': 1668, 'epoch': 2} {'type': 'loss', 'content': 0.43989962339401245, 'timestamp': '2025-09-05 09:02:09.096279', 'step': 1669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:09.302837', 'step': 1669, 'epoch': 2} {'type': 'loss', 'content': 0.30262741446495056, 'timestamp': '2025-09-05 09:02:09.310213', 'step': 1670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:02:09.509474', 'step': 1670, 'epoch': 2} {'type': 'loss', 'content': 0.2830575406551361, 'timestamp': '2025-09-05 09:02:09.511875', 'step': 1671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:09.708618', 'step': 1671, 'epoch': 2} {'type': 'loss', 'content': 0.23973654210567474, 'timestamp': '2025-09-05 09:02:09.723564', 'step': 1672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:09.912351', 'step': 1672, 'epoch': 2} {'type': 'loss', 'content': 0.4138979911804199, 'timestamp': '2025-09-05 09:02:09.914678', 'step': 1673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:10.120830', 'step': 1673, 'epoch': 2} {'type': 'loss', 'content': 0.33984220027923584, 'timestamp': '2025-09-05 09:02:10.122661', 'step': 1674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:10.330661', 'step': 1674, 'epoch': 2} {'type': 'loss', 'content': 0.4164768159389496, 'timestamp': '2025-09-05 09:02:10.332757', 'step': 1675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:10.528555', 'step': 1675, 'epoch': 2} {'type': 'loss', 'content': 0.38378530740737915, 'timestamp': '2025-09-05 09:02:10.545099', 'step': 1676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:10.752714', 'step': 1676, 'epoch': 2} {'type': 'loss', 'content': 0.4028756320476532, 'timestamp': '2025-09-05 09:02:10.755044', 'step': 1677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:10.948364', 'step': 1677, 'epoch': 2} {'type': 'loss', 'content': 0.31969425082206726, 'timestamp': '2025-09-05 09:02:10.950693', 'step': 1678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:11.157995', 'step': 1678, 'epoch': 2} {'type': 'loss', 'content': 0.41612645983695984, 'timestamp': '2025-09-05 09:02:11.160072', 'step': 1679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:11.366276', 'step': 1679, 'epoch': 2} {'type': 'loss', 'content': 0.3250633478164673, 'timestamp': '2025-09-05 09:02:11.381192', 'step': 1680, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:02:16.366281', 'step': 1680, 'epoch': 2} {'type': 'pplx', 'content': 54.385877678644015, 'timestamp': '2025-09-05 09:02:16.371386', 'step': 1680, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1680', 'timestamp': '2025-09-05 09:02:16.822315', 'step': 1680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:17.010071', 'step': 1680, 'epoch': 2} {'type': 'loss', 'content': 0.22635141015052795, 'timestamp': '2025-09-05 09:02:17.013493', 'step': 1681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:17.221771', 'step': 1681, 'epoch': 2} {'type': 'loss', 'content': 0.3104085326194763, 'timestamp': '2025-09-05 09:02:17.223669', 'step': 1682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:17.422856', 'step': 1682, 'epoch': 2} {'type': 'loss', 'content': 0.3125440776348114, 'timestamp': '2025-09-05 09:02:17.424683', 'step': 1683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:17.590705', 'step': 1683, 'epoch': 2} {'type': 'loss', 'content': 0.28808438777923584, 'timestamp': '2025-09-05 09:02:17.624104', 'step': 1684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:17.822056', 'step': 1684, 'epoch': 2} {'type': 'loss', 'content': 0.3280973434448242, 'timestamp': '2025-09-05 09:02:17.824459', 'step': 1685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:18.074431', 'step': 1685, 'epoch': 2} {'type': 'loss', 'content': 0.4642024636268616, 'timestamp': '2025-09-05 09:02:18.076468', 'step': 1686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:18.245941', 'step': 1686, 'epoch': 2} {'type': 'loss', 'content': 0.4451746940612793, 'timestamp': '2025-09-05 09:02:18.248641', 'step': 1687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:18.445851', 'step': 1687, 'epoch': 2} {'type': 'loss', 'content': 0.4639004170894623, 'timestamp': '2025-09-05 09:02:18.460296', 'step': 1688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:18.647761', 'step': 1688, 'epoch': 2} {'type': 'loss', 'content': 0.44137394428253174, 'timestamp': '2025-09-05 09:02:18.649593', 'step': 1689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:18.855698', 'step': 1689, 'epoch': 2} {'type': 'loss', 'content': 0.3657119870185852, 'timestamp': '2025-09-05 09:02:18.857681', 'step': 1690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:19.054479', 'step': 1690, 'epoch': 2} {'type': 'loss', 'content': 0.38898834586143494, 'timestamp': '2025-09-05 09:02:19.056545', 'step': 1691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:19.252320', 'step': 1691, 'epoch': 2} {'type': 'loss', 'content': 0.3620879352092743, 'timestamp': '2025-09-05 09:02:19.266955', 'step': 1692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:19.456051', 'step': 1692, 'epoch': 2} {'type': 'loss', 'content': 0.3081320822238922, 'timestamp': '2025-09-05 09:02:19.457978', 'step': 1693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:19.625303', 'step': 1693, 'epoch': 2} {'type': 'loss', 'content': 0.4219552278518677, 'timestamp': '2025-09-05 09:02:19.627423', 'step': 1694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:02:19.832534', 'step': 1694, 'epoch': 2} {'type': 'loss', 'content': 0.31051498651504517, 'timestamp': '2025-09-05 09:02:19.834368', 'step': 1695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:20.039831', 'step': 1695, 'epoch': 2} {'type': 'loss', 'content': 0.18346603214740753, 'timestamp': '2025-09-05 09:02:20.054596', 'step': 1696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:20.251930', 'step': 1696, 'epoch': 2} {'type': 'loss', 'content': 0.2537613809108734, 'timestamp': '2025-09-05 09:02:20.253750', 'step': 1697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:20.450601', 'step': 1697, 'epoch': 2} {'type': 'loss', 'content': 0.4183553457260132, 'timestamp': '2025-09-05 09:02:20.452396', 'step': 1698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:20.659171', 'step': 1698, 'epoch': 2} {'type': 'loss', 'content': 0.3628275990486145, 'timestamp': '2025-09-05 09:02:20.661000', 'step': 1699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:20.856750', 'step': 1699, 'epoch': 2} {'type': 'loss', 'content': 0.3655959367752075, 'timestamp': '2025-09-05 09:02:20.871615', 'step': 1700, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:02:25.746505', 'step': 1700, 'epoch': 2} {'type': 'pplx', 'content': 52.8854128909846, 'timestamp': '2025-09-05 09:02:25.748403', 'step': 1700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:25.911309', 'step': 1700, 'epoch': 2} {'type': 'loss', 'content': 0.5494905710220337, 'timestamp': '2025-09-05 09:02:25.913231', 'step': 1701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:26.118423', 'step': 1701, 'epoch': 2} {'type': 'loss', 'content': 0.24261198937892914, 'timestamp': '2025-09-05 09:02:26.121092', 'step': 1702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:26.317960', 'step': 1702, 'epoch': 2} {'type': 'loss', 'content': 0.3774292469024658, 'timestamp': '2025-09-05 09:02:26.320170', 'step': 1703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:26.488714', 'step': 1703, 'epoch': 2} {'type': 'loss', 'content': 0.3856517970561981, 'timestamp': '2025-09-05 09:02:26.506248', 'step': 1704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:26.695169', 'step': 1704, 'epoch': 2} {'type': 'loss', 'content': 0.31671613454818726, 'timestamp': '2025-09-05 09:02:26.698931', 'step': 1705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:02:26.947442', 'step': 1705, 'epoch': 2} {'type': 'loss', 'content': 0.33507075905799866, 'timestamp': '2025-09-05 09:02:26.980111', 'step': 1706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:27.187298', 'step': 1706, 'epoch': 2} {'type': 'loss', 'content': 0.32331711053848267, 'timestamp': '2025-09-05 09:02:27.189573', 'step': 1707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:27.355286', 'step': 1707, 'epoch': 2} {'type': 'loss', 'content': 0.34161141514778137, 'timestamp': '2025-09-05 09:02:27.372454', 'step': 1708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:27.568904', 'step': 1708, 'epoch': 2} {'type': 'loss', 'content': 0.29065045714378357, 'timestamp': '2025-09-05 09:02:27.570686', 'step': 1709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:27.778591', 'step': 1709, 'epoch': 2} {'type': 'loss', 'content': 0.22849217057228088, 'timestamp': '2025-09-05 09:02:27.780580', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:27.947810', 'step': 1710, 'epoch': 2} {'type': 'loss', 'content': 0.3984873592853546, 'timestamp': '2025-09-05 09:02:27.949766', 'step': 1711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:28.154332', 'step': 1711, 'epoch': 2} {'type': 'loss', 'content': 0.26712897419929504, 'timestamp': '2025-09-05 09:02:28.168755', 'step': 1712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:28.358064', 'step': 1712, 'epoch': 2} {'type': 'loss', 'content': 0.33036503195762634, 'timestamp': '2025-09-05 09:02:28.359870', 'step': 1713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:28.528281', 'step': 1713, 'epoch': 2} {'type': 'loss', 'content': 0.40827813744544983, 'timestamp': '2025-09-05 09:02:28.531089', 'step': 1714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:28.785579', 'step': 1714, 'epoch': 2} {'type': 'loss', 'content': 0.30413565039634705, 'timestamp': '2025-09-05 09:02:28.787924', 'step': 1715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:29.125532', 'step': 1715, 'epoch': 2} {'type': 'loss', 'content': 0.33635202050209045, 'timestamp': '2025-09-05 09:02:29.141494', 'step': 1716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:02:29.384162', 'step': 1716, 'epoch': 2} {'type': 'loss', 'content': 0.30479493737220764, 'timestamp': '2025-09-05 09:02:29.385834', 'step': 1717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:29.580567', 'step': 1717, 'epoch': 2} {'type': 'loss', 'content': 0.25971081852912903, 'timestamp': '2025-09-05 09:02:29.582717', 'step': 1718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:29.788226', 'step': 1718, 'epoch': 2} {'type': 'loss', 'content': 0.33488330245018005, 'timestamp': '2025-09-05 09:02:29.790269', 'step': 1719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:30.038248', 'step': 1719, 'epoch': 2} {'type': 'loss', 'content': 0.2982542812824249, 'timestamp': '2025-09-05 09:02:30.053364', 'step': 1720, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:02:36.028498', 'step': 1720, 'epoch': 2} {'type': 'pplx', 'content': 52.41845639341919, 'timestamp': '2025-09-05 09:02:36.031743', 'step': 1720, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1720', 'timestamp': '2025-09-05 09:02:36.853432', 'step': 1720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:37.032163', 'step': 1720, 'epoch': 2} {'type': 'loss', 'content': 0.2861115038394928, 'timestamp': '2025-09-05 09:02:37.034925', 'step': 1721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:37.234316', 'step': 1721, 'epoch': 2} {'type': 'loss', 'content': 0.22263473272323608, 'timestamp': '2025-09-05 09:02:37.236395', 'step': 1722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:37.435471', 'step': 1722, 'epoch': 2} {'type': 'loss', 'content': 0.2188272327184677, 'timestamp': '2025-09-05 09:02:37.437670', 'step': 1723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:37.636131', 'step': 1723, 'epoch': 2} {'type': 'loss', 'content': 0.3807090222835541, 'timestamp': '2025-09-05 09:02:37.651194', 'step': 1724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:37.842885', 'step': 1724, 'epoch': 2} {'type': 'loss', 'content': 0.3141788840293884, 'timestamp': '2025-09-05 09:02:37.844950', 'step': 1725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:38.041711', 'step': 1725, 'epoch': 2} {'type': 'loss', 'content': 0.28930506110191345, 'timestamp': '2025-09-05 09:02:38.043434', 'step': 1726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:38.240729', 'step': 1726, 'epoch': 2} {'type': 'loss', 'content': 0.3060302734375, 'timestamp': '2025-09-05 09:02:38.242833', 'step': 1727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:02:38.442684', 'step': 1727, 'epoch': 2} {'type': 'loss', 'content': 0.3646160662174225, 'timestamp': '2025-09-05 09:02:38.459198', 'step': 1728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:38.657264', 'step': 1728, 'epoch': 2} {'type': 'loss', 'content': 0.4415394365787506, 'timestamp': '2025-09-05 09:02:38.660657', 'step': 1729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:38.867898', 'step': 1729, 'epoch': 2} {'type': 'loss', 'content': 0.330495148897171, 'timestamp': '2025-09-05 09:02:38.870259', 'step': 1730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:39.068825', 'step': 1730, 'epoch': 2} {'type': 'loss', 'content': 0.30471497774124146, 'timestamp': '2025-09-05 09:02:39.070707', 'step': 1731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:39.267010', 'step': 1731, 'epoch': 2} {'type': 'loss', 'content': 0.3218672275543213, 'timestamp': '2025-09-05 09:02:39.281489', 'step': 1732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:39.475538', 'step': 1732, 'epoch': 2} {'type': 'loss', 'content': 0.467593252658844, 'timestamp': '2025-09-05 09:02:39.477300', 'step': 1733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:39.726323', 'step': 1733, 'epoch': 2} {'type': 'loss', 'content': 0.37956318259239197, 'timestamp': '2025-09-05 09:02:39.728240', 'step': 1734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:39.925216', 'step': 1734, 'epoch': 2} {'type': 'loss', 'content': 0.4981067478656769, 'timestamp': '2025-09-05 09:02:39.927497', 'step': 1735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:40.135632', 'step': 1735, 'epoch': 2} {'type': 'loss', 'content': 0.31048834323883057, 'timestamp': '2025-09-05 09:02:40.150598', 'step': 1736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:40.366202', 'step': 1736, 'epoch': 2} {'type': 'loss', 'content': 0.283828467130661, 'timestamp': '2025-09-05 09:02:40.368169', 'step': 1737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:40.535557', 'step': 1737, 'epoch': 2} {'type': 'loss', 'content': 0.25476568937301636, 'timestamp': '2025-09-05 09:02:40.538151', 'step': 1738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:40.752855', 'step': 1738, 'epoch': 2} {'type': 'loss', 'content': 0.23773322999477386, 'timestamp': '2025-09-05 09:02:40.754924', 'step': 1739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:40.962448', 'step': 1739, 'epoch': 2} {'type': 'loss', 'content': 0.3678232729434967, 'timestamp': '2025-09-05 09:02:40.977673', 'step': 1740, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:02:46.695748', 'step': 1740, 'epoch': 2} {'type': 'pplx', 'content': 52.87610880309225, 'timestamp': '2025-09-05 09:02:46.697902', 'step': 1740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:46.860812', 'step': 1740, 'epoch': 2} {'type': 'loss', 'content': 0.42199501395225525, 'timestamp': '2025-09-05 09:02:46.862869', 'step': 1741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:47.030713', 'step': 1741, 'epoch': 2} {'type': 'loss', 'content': 0.27012646198272705, 'timestamp': '2025-09-05 09:02:47.048230', 'step': 1742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:47.253601', 'step': 1742, 'epoch': 2} {'type': 'loss', 'content': 0.33601266145706177, 'timestamp': '2025-09-05 09:02:47.255971', 'step': 1743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:47.450875', 'step': 1743, 'epoch': 2} {'type': 'loss', 'content': 0.34322261810302734, 'timestamp': '2025-09-05 09:02:47.466057', 'step': 1744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:47.707178', 'step': 1744, 'epoch': 2} {'type': 'loss', 'content': 0.21854346990585327, 'timestamp': '2025-09-05 09:02:47.709562', 'step': 1745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:47.915477', 'step': 1745, 'epoch': 2} {'type': 'loss', 'content': 0.2908678352832794, 'timestamp': '2025-09-05 09:02:47.917360', 'step': 1746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:02:48.084021', 'step': 1746, 'epoch': 2} {'type': 'loss', 'content': 0.22152143716812134, 'timestamp': '2025-09-05 09:02:48.167207', 'step': 1747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:48.412311', 'step': 1747, 'epoch': 2} {'type': 'loss', 'content': 0.3264712691307068, 'timestamp': '2025-09-05 09:02:48.427033', 'step': 1748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:48.615841', 'step': 1748, 'epoch': 2} {'type': 'loss', 'content': 0.33076176047325134, 'timestamp': '2025-09-05 09:02:48.658513', 'step': 1749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:02:48.907009', 'step': 1749, 'epoch': 2} {'type': 'loss', 'content': 0.29463064670562744, 'timestamp': '2025-09-05 09:02:48.909234', 'step': 1750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:49.117191', 'step': 1750, 'epoch': 2} {'type': 'loss', 'content': 0.3289870023727417, 'timestamp': '2025-09-05 09:02:49.119123', 'step': 1751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:49.314893', 'step': 1751, 'epoch': 2} {'type': 'loss', 'content': 0.35243523120880127, 'timestamp': '2025-09-05 09:02:49.329655', 'step': 1752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:49.654407', 'step': 1752, 'epoch': 2} {'type': 'loss', 'content': 0.39635375142097473, 'timestamp': '2025-09-05 09:02:49.656285', 'step': 1753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:49.861685', 'step': 1753, 'epoch': 2} {'type': 'loss', 'content': 0.23487235605716705, 'timestamp': '2025-09-05 09:02:49.864300', 'step': 1754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:50.061153', 'step': 1754, 'epoch': 2} {'type': 'loss', 'content': 0.3939005136489868, 'timestamp': '2025-09-05 09:02:50.103946', 'step': 1755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:50.438452', 'step': 1755, 'epoch': 2} {'type': 'loss', 'content': 0.3778200149536133, 'timestamp': '2025-09-05 09:02:50.448072', 'step': 1756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:50.613169', 'step': 1756, 'epoch': 2} {'type': 'loss', 'content': 0.2921392619609833, 'timestamp': '2025-09-05 09:02:50.614850', 'step': 1757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:02:50.819770', 'step': 1757, 'epoch': 2} {'type': 'loss', 'content': 0.24706591665744781, 'timestamp': '2025-09-05 09:02:50.821640', 'step': 1758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:02:51.019320', 'step': 1758, 'epoch': 2} {'type': 'loss', 'content': 0.3144387900829315, 'timestamp': '2025-09-05 09:02:51.020986', 'step': 1759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:02:51.218943', 'step': 1759, 'epoch': 2} {'type': 'loss', 'content': 0.45034703612327576, 'timestamp': '2025-09-05 09:02:51.233986', 'step': 1760, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:02:56.821336', 'step': 1760, 'epoch': 2} {'type': 'pplx', 'content': 53.89968272891614, 'timestamp': '2025-09-05 09:02:56.825297', 'step': 1760, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1760', 'timestamp': '2025-09-05 09:02:57.358500', 'step': 1760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:57.525650', 'step': 1760, 'epoch': 2} {'type': 'loss', 'content': 0.2909316122531891, 'timestamp': '2025-09-05 09:02:57.527607', 'step': 1761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:02:57.732796', 'step': 1761, 'epoch': 2} {'type': 'loss', 'content': 0.372611939907074, 'timestamp': '2025-09-05 09:02:57.734962', 'step': 1762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:57.982758', 'step': 1762, 'epoch': 2} {'type': 'loss', 'content': 0.3195144534111023, 'timestamp': '2025-09-05 09:02:57.984997', 'step': 1763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:58.191741', 'step': 1763, 'epoch': 2} {'type': 'loss', 'content': 0.2733082175254822, 'timestamp': '2025-09-05 09:02:58.206696', 'step': 1764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:58.394732', 'step': 1764, 'epoch': 2} {'type': 'loss', 'content': 0.34646353125572205, 'timestamp': '2025-09-05 09:02:58.397168', 'step': 1765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:02:58.604597', 'step': 1765, 'epoch': 2} {'type': 'loss', 'content': 0.2818068265914917, 'timestamp': '2025-09-05 09:02:58.666650', 'step': 1766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:58.946660', 'step': 1766, 'epoch': 2} {'type': 'loss', 'content': 0.20438231527805328, 'timestamp': '2025-09-05 09:02:58.948538', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:59.147170', 'step': 1767, 'epoch': 2} {'type': 'loss', 'content': 0.2941116690635681, 'timestamp': '2025-09-05 09:02:59.157344', 'step': 1768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:02:59.318970', 'step': 1768, 'epoch': 2} {'type': 'loss', 'content': 0.387058287858963, 'timestamp': '2025-09-05 09:02:59.341479', 'step': 1769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:59.590735', 'step': 1769, 'epoch': 2} {'type': 'loss', 'content': 0.5514186024665833, 'timestamp': '2025-09-05 09:02:59.593040', 'step': 1770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:02:59.789424', 'step': 1770, 'epoch': 2} {'type': 'loss', 'content': 0.39721089601516724, 'timestamp': '2025-09-05 09:02:59.791671', 'step': 1771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:00.085156', 'step': 1771, 'epoch': 2} {'type': 'loss', 'content': 0.4141196608543396, 'timestamp': '2025-09-05 09:03:00.094213', 'step': 1772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:00.255930', 'step': 1772, 'epoch': 2} {'type': 'loss', 'content': 0.3068144917488098, 'timestamp': '2025-09-05 09:03:00.257769', 'step': 1773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:00.423547', 'step': 1773, 'epoch': 2} {'type': 'loss', 'content': 0.2862834632396698, 'timestamp': '2025-09-05 09:03:00.426909', 'step': 1774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:00.636780', 'step': 1774, 'epoch': 2} {'type': 'loss', 'content': 0.5047730207443237, 'timestamp': '2025-09-05 09:03:00.639450', 'step': 1775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:00.890445', 'step': 1775, 'epoch': 2} {'type': 'loss', 'content': 0.3867059648036957, 'timestamp': '2025-09-05 09:03:00.907081', 'step': 1776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:01.102574', 'step': 1776, 'epoch': 2} {'type': 'loss', 'content': 0.32131898403167725, 'timestamp': '2025-09-05 09:03:01.104756', 'step': 1777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:01.301984', 'step': 1777, 'epoch': 2} {'type': 'loss', 'content': 0.26770636439323425, 'timestamp': '2025-09-05 09:03:01.304878', 'step': 1778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:01.554677', 'step': 1778, 'epoch': 2} {'type': 'loss', 'content': 0.15419012308120728, 'timestamp': '2025-09-05 09:03:01.556780', 'step': 1779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:01.752071', 'step': 1779, 'epoch': 2} {'type': 'loss', 'content': 0.4039864242076874, 'timestamp': '2025-09-05 09:03:01.766230', 'step': 1780, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:03:07.403589', 'step': 1780, 'epoch': 2} {'type': 'pplx', 'content': 54.678229296229425, 'timestamp': '2025-09-05 09:03:07.406101', 'step': 1780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:07.567589', 'step': 1780, 'epoch': 2} {'type': 'loss', 'content': 0.33174604177474976, 'timestamp': '2025-09-05 09:03:07.569717', 'step': 1781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:07.776051', 'step': 1781, 'epoch': 2} {'type': 'loss', 'content': 0.36827021837234497, 'timestamp': '2025-09-05 09:03:07.778234', 'step': 1782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:07.984836', 'step': 1782, 'epoch': 2} {'type': 'loss', 'content': 0.23714663088321686, 'timestamp': '2025-09-05 09:03:07.986846', 'step': 1783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:08.184589', 'step': 1783, 'epoch': 2} {'type': 'loss', 'content': 0.34843164682388306, 'timestamp': '2025-09-05 09:03:08.198645', 'step': 1784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:08.387243', 'step': 1784, 'epoch': 2} {'type': 'loss', 'content': 0.33095985651016235, 'timestamp': '2025-09-05 09:03:08.388990', 'step': 1785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:08.554852', 'step': 1785, 'epoch': 2} {'type': 'loss', 'content': 0.34028369188308716, 'timestamp': '2025-09-05 09:03:08.559697', 'step': 1786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:08.766857', 'step': 1786, 'epoch': 2} {'type': 'loss', 'content': 0.25933557748794556, 'timestamp': '2025-09-05 09:03:08.770226', 'step': 1787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:09.021745', 'step': 1787, 'epoch': 2} {'type': 'loss', 'content': 0.3989739716053009, 'timestamp': '2025-09-05 09:03:09.035974', 'step': 1788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:09.224379', 'step': 1788, 'epoch': 2} {'type': 'loss', 'content': 0.38663944602012634, 'timestamp': '2025-09-05 09:03:09.227234', 'step': 1789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:09.430888', 'step': 1789, 'epoch': 2} {'type': 'loss', 'content': 0.17637419700622559, 'timestamp': '2025-09-05 09:03:09.432645', 'step': 1790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:09.628726', 'step': 1790, 'epoch': 2} {'type': 'loss', 'content': 0.2469310164451599, 'timestamp': '2025-09-05 09:03:09.630565', 'step': 1791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:09.826584', 'step': 1791, 'epoch': 2} {'type': 'loss', 'content': 0.3839358687400818, 'timestamp': '2025-09-05 09:03:09.840975', 'step': 1792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:10.071018', 'step': 1792, 'epoch': 2} {'type': 'loss', 'content': 0.36574140191078186, 'timestamp': '2025-09-05 09:03:10.114734', 'step': 1793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:10.319121', 'step': 1793, 'epoch': 2} {'type': 'loss', 'content': 0.3396137058734894, 'timestamp': '2025-09-05 09:03:10.321423', 'step': 1794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:10.528211', 'step': 1794, 'epoch': 2} {'type': 'loss', 'content': 0.3768261671066284, 'timestamp': '2025-09-05 09:03:10.530196', 'step': 1795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:10.775691', 'step': 1795, 'epoch': 2} {'type': 'loss', 'content': 0.3471960723400116, 'timestamp': '2025-09-05 09:03:10.791006', 'step': 1796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:10.980600', 'step': 1796, 'epoch': 2} {'type': 'loss', 'content': 0.4404922425746918, 'timestamp': '2025-09-05 09:03:10.982368', 'step': 1797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:03:11.274310', 'step': 1797, 'epoch': 2} {'type': 'loss', 'content': 0.2796347141265869, 'timestamp': '2025-09-05 09:03:11.276786', 'step': 1798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:11.474461', 'step': 1798, 'epoch': 2} {'type': 'loss', 'content': 0.3296290338039398, 'timestamp': '2025-09-05 09:03:11.476700', 'step': 1799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:11.682812', 'step': 1799, 'epoch': 2} {'type': 'loss', 'content': 0.2389107197523117, 'timestamp': '2025-09-05 09:03:11.699218', 'step': 1800, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:03:16.674241', 'step': 1800, 'epoch': 2} {'type': 'pplx', 'content': 55.036339384505084, 'timestamp': '2025-09-05 09:03:16.676009', 'step': 1800, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1800', 'timestamp': '2025-09-05 09:03:17.178926', 'step': 1800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:17.379411', 'step': 1800, 'epoch': 2} {'type': 'loss', 'content': 0.32822126150131226, 'timestamp': '2025-09-05 09:03:17.382120', 'step': 1801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:17.578801', 'step': 1801, 'epoch': 2} {'type': 'loss', 'content': 0.2833464443683624, 'timestamp': '2025-09-05 09:03:17.581895', 'step': 1802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:17.781822', 'step': 1802, 'epoch': 2} {'type': 'loss', 'content': 0.3650655448436737, 'timestamp': '2025-09-05 09:03:17.784468', 'step': 1803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:18.050377', 'step': 1803, 'epoch': 2} {'type': 'loss', 'content': 0.3336227238178253, 'timestamp': '2025-09-05 09:03:18.066982', 'step': 1804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:18.267529', 'step': 1804, 'epoch': 2} {'type': 'loss', 'content': 0.3089102804660797, 'timestamp': '2025-09-05 09:03:18.270031', 'step': 1805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:18.468268', 'step': 1805, 'epoch': 2} {'type': 'loss', 'content': 0.2924884259700775, 'timestamp': '2025-09-05 09:03:18.470148', 'step': 1806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:18.665728', 'step': 1806, 'epoch': 2} {'type': 'loss', 'content': 0.25991401076316833, 'timestamp': '2025-09-05 09:03:18.667725', 'step': 1807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:18.872934', 'step': 1807, 'epoch': 2} {'type': 'loss', 'content': 0.38074877858161926, 'timestamp': '2025-09-05 09:03:18.887103', 'step': 1808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:19.075301', 'step': 1808, 'epoch': 2} {'type': 'loss', 'content': 0.2662460505962372, 'timestamp': '2025-09-05 09:03:19.077492', 'step': 1809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:19.283635', 'step': 1809, 'epoch': 2} {'type': 'loss', 'content': 0.3200852572917938, 'timestamp': '2025-09-05 09:03:19.285866', 'step': 1810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:19.514373', 'step': 1810, 'epoch': 2} {'type': 'loss', 'content': 0.23158332705497742, 'timestamp': '2025-09-05 09:03:19.516172', 'step': 1811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:19.714236', 'step': 1811, 'epoch': 2} {'type': 'loss', 'content': 0.38919728994369507, 'timestamp': '2025-09-05 09:03:19.723741', 'step': 1812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:19.892125', 'step': 1812, 'epoch': 2} {'type': 'loss', 'content': 0.38500481843948364, 'timestamp': '2025-09-05 09:03:19.894564', 'step': 1813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:20.062946', 'step': 1813, 'epoch': 2} {'type': 'loss', 'content': 0.4389552175998688, 'timestamp': '2025-09-05 09:03:20.065351', 'step': 1814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:20.262580', 'step': 1814, 'epoch': 2} {'type': 'loss', 'content': 0.4390960931777954, 'timestamp': '2025-09-05 09:03:20.265034', 'step': 1815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:20.432565', 'step': 1815, 'epoch': 2} {'type': 'loss', 'content': 0.3098478615283966, 'timestamp': '2025-09-05 09:03:20.448133', 'step': 1816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:20.639359', 'step': 1816, 'epoch': 2} {'type': 'loss', 'content': 0.28565704822540283, 'timestamp': '2025-09-05 09:03:20.641294', 'step': 1817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:20.850568', 'step': 1817, 'epoch': 2} {'type': 'loss', 'content': 0.47005581855773926, 'timestamp': '2025-09-05 09:03:20.852541', 'step': 1818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:21.020394', 'step': 1818, 'epoch': 2} {'type': 'loss', 'content': 0.38893210887908936, 'timestamp': '2025-09-05 09:03:21.022791', 'step': 1819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:21.219082', 'step': 1819, 'epoch': 2} {'type': 'loss', 'content': 0.26181450486183167, 'timestamp': '2025-09-05 09:03:21.228380', 'step': 1820, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:03:25.920801', 'step': 1820, 'epoch': 2} {'type': 'pplx', 'content': 55.33664731691632, 'timestamp': '2025-09-05 09:03:25.923217', 'step': 1820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:26.088428', 'step': 1820, 'epoch': 2} {'type': 'loss', 'content': 0.4290368854999542, 'timestamp': '2025-09-05 09:03:26.090971', 'step': 1821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:26.256546', 'step': 1821, 'epoch': 2} {'type': 'loss', 'content': 0.3403278887271881, 'timestamp': '2025-09-05 09:03:26.272384', 'step': 1822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:26.478670', 'step': 1822, 'epoch': 2} {'type': 'loss', 'content': 0.35543927550315857, 'timestamp': '2025-09-05 09:03:26.481017', 'step': 1823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:26.678647', 'step': 1823, 'epoch': 2} {'type': 'loss', 'content': 0.36923372745513916, 'timestamp': '2025-09-05 09:03:26.696460', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:26.894759', 'step': 1824, 'epoch': 2} {'type': 'loss', 'content': 0.33264076709747314, 'timestamp': '2025-09-05 09:03:26.898813', 'step': 1825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:27.095284', 'step': 1825, 'epoch': 2} {'type': 'loss', 'content': 0.2916991710662842, 'timestamp': '2025-09-05 09:03:27.097313', 'step': 1826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:27.294793', 'step': 1826, 'epoch': 2} {'type': 'loss', 'content': 0.339844286441803, 'timestamp': '2025-09-05 09:03:27.296700', 'step': 1827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:27.492683', 'step': 1827, 'epoch': 2} {'type': 'loss', 'content': 0.25583401322364807, 'timestamp': '2025-09-05 09:03:27.501965', 'step': 1828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:27.667250', 'step': 1828, 'epoch': 2} {'type': 'loss', 'content': 0.2994399070739746, 'timestamp': '2025-09-05 09:03:27.669072', 'step': 1829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:27.875984', 'step': 1829, 'epoch': 2} {'type': 'loss', 'content': 0.2248479723930359, 'timestamp': '2025-09-05 09:03:27.877941', 'step': 1830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:28.074462', 'step': 1830, 'epoch': 2} {'type': 'loss', 'content': 0.44062700867652893, 'timestamp': '2025-09-05 09:03:28.076844', 'step': 1831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:28.284610', 'step': 1831, 'epoch': 2} {'type': 'loss', 'content': 0.34865128993988037, 'timestamp': '2025-09-05 09:03:28.294083', 'step': 1832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:28.457941', 'step': 1832, 'epoch': 2} {'type': 'loss', 'content': 0.3390546441078186, 'timestamp': '2025-09-05 09:03:28.459806', 'step': 1833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:28.625641', 'step': 1833, 'epoch': 2} {'type': 'loss', 'content': 0.4029175639152527, 'timestamp': '2025-09-05 09:03:28.627560', 'step': 1834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:28.833445', 'step': 1834, 'epoch': 2} {'type': 'loss', 'content': 0.29427599906921387, 'timestamp': '2025-09-05 09:03:28.835361', 'step': 1835, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:29.042402', 'step': 1835, 'epoch': 2} {'type': 'loss', 'content': 0.36329180002212524, 'timestamp': '2025-09-05 09:03:29.059551', 'step': 1836, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:29.261899', 'step': 1836, 'epoch': 2} {'type': 'loss', 'content': 0.24370358884334564, 'timestamp': '2025-09-05 09:03:29.264242', 'step': 1837, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:03:29.435765', 'step': 1837, 'epoch': 2} {'type': 'loss', 'content': 0.4641832411289215, 'timestamp': '2025-09-05 09:03:29.438505', 'step': 1838, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:29.645590', 'step': 1838, 'epoch': 2} {'type': 'loss', 'content': 0.27168017625808716, 'timestamp': '2025-09-05 09:03:29.648925', 'step': 1839, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:29.898200', 'step': 1839, 'epoch': 2} {'type': 'loss', 'content': 0.2619711458683014, 'timestamp': '2025-09-05 09:03:29.964385', 'step': 1840, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:03:34.650912', 'step': 1840, 'epoch': 2} {'type': 'pplx', 'content': 55.72341477187909, 'timestamp': '2025-09-05 09:03:34.653569', 'step': 1840, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1840', 'timestamp': '2025-09-05 09:03:35.179728', 'step': 1840, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:35.347722', 'step': 1840, 'epoch': 2} {'type': 'loss', 'content': 0.3577501177787781, 'timestamp': '2025-09-05 09:03:35.349827', 'step': 1841, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:35.556211', 'step': 1841, 'epoch': 2} {'type': 'loss', 'content': 0.3645785450935364, 'timestamp': '2025-09-05 09:03:35.558862', 'step': 1842, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:35.726141', 'step': 1842, 'epoch': 2} {'type': 'loss', 'content': 0.356271892786026, 'timestamp': '2025-09-05 09:03:35.728901', 'step': 1843, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:35.932810', 'step': 1843, 'epoch': 2} {'type': 'loss', 'content': 0.2203180342912674, 'timestamp': '2025-09-05 09:03:35.942768', 'step': 1844, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:36.107321', 'step': 1844, 'epoch': 2} {'type': 'loss', 'content': 0.4093642234802246, 'timestamp': '2025-09-05 09:03:36.109424', 'step': 1845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:36.277831', 'step': 1845, 'epoch': 2} {'type': 'loss', 'content': 0.2778435945510864, 'timestamp': '2025-09-05 09:03:36.280627', 'step': 1846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:36.449396', 'step': 1846, 'epoch': 2} {'type': 'loss', 'content': 0.3252573013305664, 'timestamp': '2025-09-05 09:03:36.452403', 'step': 1847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:03:36.659098', 'step': 1847, 'epoch': 2} {'type': 'loss', 'content': 0.41045743227005005, 'timestamp': '2025-09-05 09:03:36.668901', 'step': 1848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:36.833074', 'step': 1848, 'epoch': 2} {'type': 'loss', 'content': 0.38269633054733276, 'timestamp': '2025-09-05 09:03:36.836355', 'step': 1849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:37.005419', 'step': 1849, 'epoch': 2} {'type': 'loss', 'content': 0.46076807379722595, 'timestamp': '2025-09-05 09:03:37.007421', 'step': 1850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:37.174049', 'step': 1850, 'epoch': 2} {'type': 'loss', 'content': 0.40671306848526, 'timestamp': '2025-09-05 09:03:37.176204', 'step': 1851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:37.373674', 'step': 1851, 'epoch': 2} {'type': 'loss', 'content': 0.19898702204227448, 'timestamp': '2025-09-05 09:03:37.384578', 'step': 1852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:37.547587', 'step': 1852, 'epoch': 2} {'type': 'loss', 'content': 0.21051372587680817, 'timestamp': '2025-09-05 09:03:37.549476', 'step': 1853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:03:37.758174', 'step': 1853, 'epoch': 2} {'type': 'loss', 'content': 0.17444637417793274, 'timestamp': '2025-09-05 09:03:37.760015', 'step': 1854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:38.011527', 'step': 1854, 'epoch': 2} {'type': 'loss', 'content': 0.25767529010772705, 'timestamp': '2025-09-05 09:03:38.013700', 'step': 1855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:38.211086', 'step': 1855, 'epoch': 2} {'type': 'loss', 'content': 0.283276230096817, 'timestamp': '2025-09-05 09:03:38.220740', 'step': 1856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:38.384710', 'step': 1856, 'epoch': 2} {'type': 'loss', 'content': 0.36334753036499023, 'timestamp': '2025-09-05 09:03:38.387474', 'step': 1857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:38.555920', 'step': 1857, 'epoch': 2} {'type': 'loss', 'content': 0.24148380756378174, 'timestamp': '2025-09-05 09:03:38.558779', 'step': 1858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:03:38.758088', 'step': 1858, 'epoch': 2} {'type': 'loss', 'content': 0.2839478552341461, 'timestamp': '2025-09-05 09:03:38.760496', 'step': 1859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:38.956342', 'step': 1859, 'epoch': 2} {'type': 'loss', 'content': 0.2954237163066864, 'timestamp': '2025-09-05 09:03:38.973295', 'step': 1860, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:03:43.752815', 'step': 1860, 'epoch': 2} {'type': 'pplx', 'content': 55.779841573659716, 'timestamp': '2025-09-05 09:03:43.755396', 'step': 1860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:43.917975', 'step': 1860, 'epoch': 2} {'type': 'loss', 'content': 0.25287219882011414, 'timestamp': '2025-09-05 09:03:43.920185', 'step': 1861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:44.125292', 'step': 1861, 'epoch': 2} {'type': 'loss', 'content': 0.36329612135887146, 'timestamp': '2025-09-05 09:03:44.127298', 'step': 1862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:44.334342', 'step': 1862, 'epoch': 2} {'type': 'loss', 'content': 0.214580699801445, 'timestamp': '2025-09-05 09:03:44.336143', 'step': 1863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:44.543600', 'step': 1863, 'epoch': 2} {'type': 'loss', 'content': 0.32577669620513916, 'timestamp': '2025-09-05 09:03:44.560339', 'step': 1864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:44.759134', 'step': 1864, 'epoch': 2} {'type': 'loss', 'content': 0.45734408497810364, 'timestamp': '2025-09-05 09:03:44.761720', 'step': 1865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:44.967935', 'step': 1865, 'epoch': 2} {'type': 'loss', 'content': 0.3126198947429657, 'timestamp': '2025-09-05 09:03:44.970550', 'step': 1866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:45.140680', 'step': 1866, 'epoch': 2} {'type': 'loss', 'content': 0.2847732603549957, 'timestamp': '2025-09-05 09:03:45.142554', 'step': 1867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:45.349745', 'step': 1867, 'epoch': 2} {'type': 'loss', 'content': 0.25974026322364807, 'timestamp': '2025-09-05 09:03:45.359453', 'step': 1868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:45.559114', 'step': 1868, 'epoch': 2} {'type': 'loss', 'content': 0.3027217388153076, 'timestamp': '2025-09-05 09:03:45.561007', 'step': 1869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:45.767401', 'step': 1869, 'epoch': 2} {'type': 'loss', 'content': 0.2498902678489685, 'timestamp': '2025-09-05 09:03:45.769305', 'step': 1870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:45.966695', 'step': 1870, 'epoch': 2} {'type': 'loss', 'content': 0.3127624988555908, 'timestamp': '2025-09-05 09:03:45.969508', 'step': 1871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:46.181681', 'step': 1871, 'epoch': 2} {'type': 'loss', 'content': 0.298271507024765, 'timestamp': '2025-09-05 09:03:46.199839', 'step': 1872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:46.481768', 'step': 1872, 'epoch': 2} {'type': 'loss', 'content': 0.31834274530410767, 'timestamp': '2025-09-05 09:03:46.483757', 'step': 1873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:46.783137', 'step': 1873, 'epoch': 2} {'type': 'loss', 'content': 0.264957994222641, 'timestamp': '2025-09-05 09:03:46.799236', 'step': 1874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:47.018791', 'step': 1874, 'epoch': 2} {'type': 'loss', 'content': 0.43002596497535706, 'timestamp': '2025-09-05 09:03:47.020698', 'step': 1875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:47.227210', 'step': 1875, 'epoch': 2} {'type': 'loss', 'content': 0.4498365819454193, 'timestamp': '2025-09-05 09:03:47.242075', 'step': 1876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:47.496978', 'step': 1876, 'epoch': 2} {'type': 'loss', 'content': 0.21851421892642975, 'timestamp': '2025-09-05 09:03:47.498804', 'step': 1877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:47.665836', 'step': 1877, 'epoch': 2} {'type': 'loss', 'content': 0.3184349238872528, 'timestamp': '2025-09-05 09:03:47.667876', 'step': 1878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:47.871856', 'step': 1878, 'epoch': 2} {'type': 'loss', 'content': 0.33598777651786804, 'timestamp': '2025-09-05 09:03:47.873933', 'step': 1879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:48.041864', 'step': 1879, 'epoch': 2} {'type': 'loss', 'content': 0.31021371483802795, 'timestamp': '2025-09-05 09:03:48.058333', 'step': 1880, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:03:52.719413', 'step': 1880, 'epoch': 2} {'type': 'pplx', 'content': 55.9173870399914, 'timestamp': '2025-09-05 09:03:52.721544', 'step': 1880, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1880', 'timestamp': '2025-09-05 09:03:53.184102', 'step': 1880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:53.347637', 'step': 1880, 'epoch': 2} {'type': 'loss', 'content': 0.35257551074028015, 'timestamp': '2025-09-05 09:03:53.349442', 'step': 1881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:53.516039', 'step': 1881, 'epoch': 2} {'type': 'loss', 'content': 0.41870570182800293, 'timestamp': '2025-09-05 09:03:53.518151', 'step': 1882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:53.685692', 'step': 1882, 'epoch': 2} {'type': 'loss', 'content': 0.33617204427719116, 'timestamp': '2025-09-05 09:03:53.688722', 'step': 1883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:53.883855', 'step': 1883, 'epoch': 2} {'type': 'loss', 'content': 0.4209844172000885, 'timestamp': '2025-09-05 09:03:53.899544', 'step': 1884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:54.089537', 'step': 1884, 'epoch': 2} {'type': 'loss', 'content': 0.27658504247665405, 'timestamp': '2025-09-05 09:03:54.097167', 'step': 1885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:54.432684', 'step': 1885, 'epoch': 2} {'type': 'loss', 'content': 0.30306634306907654, 'timestamp': '2025-09-05 09:03:54.434610', 'step': 1886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:54.601164', 'step': 1886, 'epoch': 2} {'type': 'loss', 'content': 0.3418419361114502, 'timestamp': '2025-09-05 09:03:54.603665', 'step': 1887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:54.806396', 'step': 1887, 'epoch': 2} {'type': 'loss', 'content': 0.24061523377895355, 'timestamp': '2025-09-05 09:03:54.822115', 'step': 1888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:55.010753', 'step': 1888, 'epoch': 2} {'type': 'loss', 'content': 0.3230704665184021, 'timestamp': '2025-09-05 09:03:55.013738', 'step': 1889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:55.218951', 'step': 1889, 'epoch': 2} {'type': 'loss', 'content': 0.28835007548332214, 'timestamp': '2025-09-05 09:03:55.221346', 'step': 1890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:55.387695', 'step': 1890, 'epoch': 2} {'type': 'loss', 'content': 0.41431716084480286, 'timestamp': '2025-09-05 09:03:55.390377', 'step': 1891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:03:55.586573', 'step': 1891, 'epoch': 2} {'type': 'loss', 'content': 0.19797471165657043, 'timestamp': '2025-09-05 09:03:55.597127', 'step': 1892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:55.759697', 'step': 1892, 'epoch': 2} {'type': 'loss', 'content': 0.3431963324546814, 'timestamp': '2025-09-05 09:03:55.762708', 'step': 1893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:55.971229', 'step': 1893, 'epoch': 2} {'type': 'loss', 'content': 0.3090471923351288, 'timestamp': '2025-09-05 09:03:55.973593', 'step': 1894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:03:56.142593', 'step': 1894, 'epoch': 2} {'type': 'loss', 'content': 0.2430524230003357, 'timestamp': '2025-09-05 09:03:56.145717', 'step': 1895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:56.352947', 'step': 1895, 'epoch': 2} {'type': 'loss', 'content': 0.23343847692012787, 'timestamp': '2025-09-05 09:03:56.365935', 'step': 1896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:56.529315', 'step': 1896, 'epoch': 2} {'type': 'loss', 'content': 0.3793867826461792, 'timestamp': '2025-09-05 09:03:56.531878', 'step': 1897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:03:56.698816', 'step': 1897, 'epoch': 2} {'type': 'loss', 'content': 0.37744349241256714, 'timestamp': '2025-09-05 09:03:56.701645', 'step': 1898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:03:56.871278', 'step': 1898, 'epoch': 2} {'type': 'loss', 'content': 0.22595612704753876, 'timestamp': '2025-09-05 09:03:56.874840', 'step': 1899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:03:57.081380', 'step': 1899, 'epoch': 2} {'type': 'loss', 'content': 0.25911441445350647, 'timestamp': '2025-09-05 09:03:57.096490', 'step': 1900, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:04:01.883995', 'step': 1900, 'epoch': 2} {'type': 'pplx', 'content': 55.85029067024223, 'timestamp': '2025-09-05 09:04:01.886021', 'step': 1900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:04:02.049119', 'step': 1900, 'epoch': 2} {'type': 'loss', 'content': 0.31988221406936646, 'timestamp': '2025-09-05 09:04:02.050787', 'step': 1901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:02.256430', 'step': 1901, 'epoch': 2} {'type': 'loss', 'content': 0.2469642162322998, 'timestamp': '2025-09-05 09:04:02.258336', 'step': 1902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:02.428352', 'step': 1902, 'epoch': 2} {'type': 'loss', 'content': 0.35464945435523987, 'timestamp': '2025-09-05 09:04:02.430549', 'step': 1903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:02.629860', 'step': 1903, 'epoch': 2} {'type': 'loss', 'content': 0.4780825972557068, 'timestamp': '2025-09-05 09:04:02.640245', 'step': 1904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:04:02.801940', 'step': 1904, 'epoch': 2} {'type': 'loss', 'content': 0.2757084369659424, 'timestamp': '2025-09-05 09:04:02.803938', 'step': 1905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:02.972141', 'step': 1905, 'epoch': 2} {'type': 'loss', 'content': 0.1613525003194809, 'timestamp': '2025-09-05 09:04:02.974528', 'step': 1906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:03.181297', 'step': 1906, 'epoch': 2} {'type': 'loss', 'content': 0.36126402020454407, 'timestamp': '2025-09-05 09:04:03.183198', 'step': 1907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:03.355002', 'step': 1907, 'epoch': 2} {'type': 'loss', 'content': 0.49422934651374817, 'timestamp': '2025-09-05 09:04:03.370219', 'step': 1908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:03.563094', 'step': 1908, 'epoch': 2} {'type': 'loss', 'content': 0.40730562806129456, 'timestamp': '2025-09-05 09:04:03.564993', 'step': 1909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:03.733709', 'step': 1909, 'epoch': 2} {'type': 'loss', 'content': 0.3607218265533447, 'timestamp': '2025-09-05 09:04:03.736360', 'step': 1910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:03.931318', 'step': 1910, 'epoch': 2} {'type': 'loss', 'content': 0.3690507113933563, 'timestamp': '2025-09-05 09:04:03.933114', 'step': 1911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:04:04.104801', 'step': 1911, 'epoch': 2} {'type': 'loss', 'content': 0.2338135987520218, 'timestamp': '2025-09-05 09:04:04.114485', 'step': 1912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:04.278006', 'step': 1912, 'epoch': 2} {'type': 'loss', 'content': 0.46766653656959534, 'timestamp': '2025-09-05 09:04:04.279765', 'step': 1913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:04.449674', 'step': 1913, 'epoch': 2} {'type': 'loss', 'content': 0.36028924584388733, 'timestamp': '2025-09-05 09:04:04.451446', 'step': 1914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:04.618848', 'step': 1914, 'epoch': 2} {'type': 'loss', 'content': 0.4140165448188782, 'timestamp': '2025-09-05 09:04:04.621294', 'step': 1915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:04.819558', 'step': 1915, 'epoch': 2} {'type': 'loss', 'content': 0.2273964136838913, 'timestamp': '2025-09-05 09:04:04.828828', 'step': 1916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:04.994269', 'step': 1916, 'epoch': 2} {'type': 'loss', 'content': 0.3443259596824646, 'timestamp': '2025-09-05 09:04:04.997681', 'step': 1917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:05.165754', 'step': 1917, 'epoch': 2} {'type': 'loss', 'content': 0.30746108293533325, 'timestamp': '2025-09-05 09:04:05.167548', 'step': 1918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:05.335215', 'step': 1918, 'epoch': 2} {'type': 'loss', 'content': 0.2145860493183136, 'timestamp': '2025-09-05 09:04:05.338280', 'step': 1919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:05.544231', 'step': 1919, 'epoch': 2} {'type': 'loss', 'content': 0.3346792459487915, 'timestamp': '2025-09-05 09:04:05.558510', 'step': 1920, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:04:10.351935', 'step': 1920, 'epoch': 2} {'type': 'pplx', 'content': 56.15066865394947, 'timestamp': '2025-09-05 09:04:10.353809', 'step': 1920, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1920', 'timestamp': '2025-09-05 09:04:10.783963', 'step': 1920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:10.952777', 'step': 1920, 'epoch': 2} {'type': 'loss', 'content': 0.34421074390411377, 'timestamp': '2025-09-05 09:04:10.954701', 'step': 1921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:11.123289', 'step': 1921, 'epoch': 2} {'type': 'loss', 'content': 0.27249059081077576, 'timestamp': '2025-09-05 09:04:11.125101', 'step': 1922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:11.292348', 'step': 1922, 'epoch': 2} {'type': 'loss', 'content': 0.2681417465209961, 'timestamp': '2025-09-05 09:04:11.295915', 'step': 1923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:04:11.506679', 'step': 1923, 'epoch': 2} {'type': 'loss', 'content': 0.4062962234020233, 'timestamp': '2025-09-05 09:04:11.516326', 'step': 1924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:11.678786', 'step': 1924, 'epoch': 2} {'type': 'loss', 'content': 0.3572784960269928, 'timestamp': '2025-09-05 09:04:11.680783', 'step': 1925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:04:11.848575', 'step': 1925, 'epoch': 2} {'type': 'loss', 'content': 0.3298678994178772, 'timestamp': '2025-09-05 09:04:11.850534', 'step': 1926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:12.045465', 'step': 1926, 'epoch': 2} {'type': 'loss', 'content': 0.17969104647636414, 'timestamp': '2025-09-05 09:04:12.047081', 'step': 1927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:12.213555', 'step': 1927, 'epoch': 2} {'type': 'loss', 'content': 0.27280840277671814, 'timestamp': '2025-09-05 09:04:12.227905', 'step': 1928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:12.417495', 'step': 1928, 'epoch': 2} {'type': 'loss', 'content': 0.3779914379119873, 'timestamp': '2025-09-05 09:04:12.420129', 'step': 1929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:12.587001', 'step': 1929, 'epoch': 2} {'type': 'loss', 'content': 0.32336172461509705, 'timestamp': '2025-09-05 09:04:12.589142', 'step': 1930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:12.796003', 'step': 1930, 'epoch': 2} {'type': 'loss', 'content': 0.5328038930892944, 'timestamp': '2025-09-05 09:04:12.802014', 'step': 1931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:13.012545', 'step': 1931, 'epoch': 2} {'type': 'loss', 'content': 0.2591715157032013, 'timestamp': '2025-09-05 09:04:13.022339', 'step': 1932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:13.186085', 'step': 1932, 'epoch': 2} {'type': 'loss', 'content': 0.39072778820991516, 'timestamp': '2025-09-05 09:04:13.188276', 'step': 1933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:13.357480', 'step': 1933, 'epoch': 2} {'type': 'loss', 'content': 0.20760498940944672, 'timestamp': '2025-09-05 09:04:13.359460', 'step': 1934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:13.528291', 'step': 1934, 'epoch': 2} {'type': 'loss', 'content': 0.2971227467060089, 'timestamp': '2025-09-05 09:04:13.530536', 'step': 1935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:13.728107', 'step': 1935, 'epoch': 2} {'type': 'loss', 'content': 0.3519618809223175, 'timestamp': '2025-09-05 09:04:13.743615', 'step': 1936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:13.935962', 'step': 1936, 'epoch': 2} {'type': 'loss', 'content': 0.4732755720615387, 'timestamp': '2025-09-05 09:04:13.937754', 'step': 1937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:14.144175', 'step': 1937, 'epoch': 2} {'type': 'loss', 'content': 0.22129914164543152, 'timestamp': '2025-09-05 09:04:14.146537', 'step': 1938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:14.315053', 'step': 1938, 'epoch': 2} {'type': 'loss', 'content': 0.27725905179977417, 'timestamp': '2025-09-05 09:04:14.317471', 'step': 1939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:14.483480', 'step': 1939, 'epoch': 2} {'type': 'loss', 'content': 0.30071911215782166, 'timestamp': '2025-09-05 09:04:14.498059', 'step': 1940, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:04:19.889789', 'step': 1940, 'epoch': 2} {'type': 'pplx', 'content': 56.40086004465284, 'timestamp': '2025-09-05 09:04:19.897148', 'step': 1940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:20.060950', 'step': 1940, 'epoch': 2} {'type': 'loss', 'content': 0.47121545672416687, 'timestamp': '2025-09-05 09:04:20.063252', 'step': 1941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:20.230989', 'step': 1941, 'epoch': 2} {'type': 'loss', 'content': 0.2984376847743988, 'timestamp': '2025-09-05 09:04:20.233513', 'step': 1942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:20.440935', 'step': 1942, 'epoch': 2} {'type': 'loss', 'content': 0.2947276532649994, 'timestamp': '2025-09-05 09:04:20.443535', 'step': 1943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:20.611995', 'step': 1943, 'epoch': 2} {'type': 'loss', 'content': 0.3717845380306244, 'timestamp': '2025-09-05 09:04:20.628113', 'step': 1944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:20.816134', 'step': 1944, 'epoch': 2} {'type': 'loss', 'content': 0.27086225152015686, 'timestamp': '2025-09-05 09:04:20.818437', 'step': 1945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:20.986429', 'step': 1945, 'epoch': 2} {'type': 'loss', 'content': 0.36322546005249023, 'timestamp': '2025-09-05 09:04:20.989100', 'step': 1946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:21.186891', 'step': 1946, 'epoch': 2} {'type': 'loss', 'content': 0.3279218077659607, 'timestamp': '2025-09-05 09:04:21.188653', 'step': 1947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:21.356516', 'step': 1947, 'epoch': 2} {'type': 'loss', 'content': 0.3289004862308502, 'timestamp': '2025-09-05 09:04:21.371172', 'step': 1948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:21.559646', 'step': 1948, 'epoch': 2} {'type': 'loss', 'content': 0.23576058447360992, 'timestamp': '2025-09-05 09:04:21.561647', 'step': 1949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:21.767476', 'step': 1949, 'epoch': 2} {'type': 'loss', 'content': 0.4314030408859253, 'timestamp': '2025-09-05 09:04:21.769580', 'step': 1950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:21.975210', 'step': 1950, 'epoch': 2} {'type': 'loss', 'content': 0.2590286135673523, 'timestamp': '2025-09-05 09:04:21.977268', 'step': 1951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:22.143053', 'step': 1951, 'epoch': 2} {'type': 'loss', 'content': 0.3938618004322052, 'timestamp': '2025-09-05 09:04:22.158294', 'step': 1952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:04:22.380932', 'step': 1952, 'epoch': 2} {'type': 'loss', 'content': 0.41767221689224243, 'timestamp': '2025-09-05 09:04:22.383122', 'step': 1953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:22.602652', 'step': 1953, 'epoch': 2} {'type': 'loss', 'content': 0.25870412588119507, 'timestamp': '2025-09-05 09:04:22.605208', 'step': 1954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:22.771168', 'step': 1954, 'epoch': 2} {'type': 'loss', 'content': 0.37756073474884033, 'timestamp': '2025-09-05 09:04:22.774225', 'step': 1955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:22.982874', 'step': 1955, 'epoch': 2} {'type': 'loss', 'content': 0.3379029631614685, 'timestamp': '2025-09-05 09:04:22.993241', 'step': 1956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:23.157972', 'step': 1956, 'epoch': 2} {'type': 'loss', 'content': 0.26806700229644775, 'timestamp': '2025-09-05 09:04:23.160348', 'step': 1957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:23.326717', 'step': 1957, 'epoch': 2} {'type': 'loss', 'content': 0.2842908203601837, 'timestamp': '2025-09-05 09:04:23.329832', 'step': 1958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:23.497903', 'step': 1958, 'epoch': 2} {'type': 'loss', 'content': 0.45978856086730957, 'timestamp': '2025-09-05 09:04:23.500376', 'step': 1959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:23.698055', 'step': 1959, 'epoch': 2} {'type': 'loss', 'content': 0.23071345686912537, 'timestamp': '2025-09-05 09:04:23.708192', 'step': 1960, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:04:28.419098', 'step': 1960, 'epoch': 2} {'type': 'pplx', 'content': 56.53060956108376, 'timestamp': '2025-09-05 09:04:28.421195', 'step': 1960, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1960', 'timestamp': '2025-09-05 09:04:28.888174', 'step': 1960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:29.072136', 'step': 1960, 'epoch': 2} {'type': 'loss', 'content': 0.2766428589820862, 'timestamp': '2025-09-05 09:04:29.074140', 'step': 1961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:29.275781', 'step': 1961, 'epoch': 2} {'type': 'loss', 'content': 0.4845949113368988, 'timestamp': '2025-09-05 09:04:29.277972', 'step': 1962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:29.473812', 'step': 1962, 'epoch': 2} {'type': 'loss', 'content': 0.28940051794052124, 'timestamp': '2025-09-05 09:04:29.475571', 'step': 1963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:29.642041', 'step': 1963, 'epoch': 2} {'type': 'loss', 'content': 0.3159952163696289, 'timestamp': '2025-09-05 09:04:29.659688', 'step': 1964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:04:29.853746', 'step': 1964, 'epoch': 2} {'type': 'loss', 'content': 0.3902105391025543, 'timestamp': '2025-09-05 09:04:29.855230', 'step': 1965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:30.058508', 'step': 1965, 'epoch': 2} {'type': 'loss', 'content': 0.2579273283481598, 'timestamp': '2025-09-05 09:04:30.060888', 'step': 1966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:30.260907', 'step': 1966, 'epoch': 2} {'type': 'loss', 'content': 0.28875139355659485, 'timestamp': '2025-09-05 09:04:30.263070', 'step': 1967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:30.530524', 'step': 1967, 'epoch': 2} {'type': 'loss', 'content': 0.3324761390686035, 'timestamp': '2025-09-05 09:04:30.540760', 'step': 1968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:30.707202', 'step': 1968, 'epoch': 2} {'type': 'loss', 'content': 0.3139239251613617, 'timestamp': '2025-09-05 09:04:30.709065', 'step': 1969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:30.877108', 'step': 1969, 'epoch': 2} {'type': 'loss', 'content': 0.2467249482870102, 'timestamp': '2025-09-05 09:04:30.879338', 'step': 1970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:31.085452', 'step': 1970, 'epoch': 2} {'type': 'loss', 'content': 0.27524515986442566, 'timestamp': '2025-09-05 09:04:31.087634', 'step': 1971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:31.294610', 'step': 1971, 'epoch': 2} {'type': 'loss', 'content': 0.2604565918445587, 'timestamp': '2025-09-05 09:04:31.304908', 'step': 1972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:31.468728', 'step': 1972, 'epoch': 2} {'type': 'loss', 'content': 0.3006044030189514, 'timestamp': '2025-09-05 09:04:31.470507', 'step': 1973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:31.656392', 'step': 1973, 'epoch': 2} {'type': 'loss', 'content': 0.3287713825702667, 'timestamp': '2025-09-05 09:04:31.658541', 'step': 1974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:31.857546', 'step': 1974, 'epoch': 2} {'type': 'loss', 'content': 0.3500474989414215, 'timestamp': '2025-09-05 09:04:31.860148', 'step': 1975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:32.066595', 'step': 1975, 'epoch': 2} {'type': 'loss', 'content': 0.24871431291103363, 'timestamp': '2025-09-05 09:04:32.081341', 'step': 1976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:32.271920', 'step': 1976, 'epoch': 2} {'type': 'loss', 'content': 0.3919868469238281, 'timestamp': '2025-09-05 09:04:32.273912', 'step': 1977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:32.440681', 'step': 1977, 'epoch': 2} {'type': 'loss', 'content': 0.2948307693004608, 'timestamp': '2025-09-05 09:04:32.442952', 'step': 1978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:32.648995', 'step': 1978, 'epoch': 2} {'type': 'loss', 'content': 0.4514521062374115, 'timestamp': '2025-09-05 09:04:32.651170', 'step': 1979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:04:32.856832', 'step': 1979, 'epoch': 2} {'type': 'loss', 'content': 0.3044537305831909, 'timestamp': '2025-09-05 09:04:32.871644', 'step': 1980, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:04:37.538105', 'step': 1980, 'epoch': 2} {'type': 'pplx', 'content': 56.71036435354335, 'timestamp': '2025-09-05 09:04:37.539835', 'step': 1980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:37.703241', 'step': 1980, 'epoch': 2} {'type': 'loss', 'content': 0.29383978247642517, 'timestamp': '2025-09-05 09:04:37.705202', 'step': 1981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:37.873478', 'step': 1981, 'epoch': 2} {'type': 'loss', 'content': 0.4132064878940582, 'timestamp': '2025-09-05 09:04:37.876394', 'step': 1982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:38.043828', 'step': 1982, 'epoch': 2} {'type': 'loss', 'content': 0.1803075671195984, 'timestamp': '2025-09-05 09:04:38.046427', 'step': 1983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:38.241865', 'step': 1983, 'epoch': 2} {'type': 'loss', 'content': 0.411940336227417, 'timestamp': '2025-09-05 09:04:38.257614', 'step': 1984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:04:38.446287', 'step': 1984, 'epoch': 2} {'type': 'loss', 'content': 0.3590596616268158, 'timestamp': '2025-09-05 09:04:38.448991', 'step': 1985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:38.645695', 'step': 1985, 'epoch': 2} {'type': 'loss', 'content': 0.28954413533210754, 'timestamp': '2025-09-05 09:04:38.647436', 'step': 1986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:04:38.812350', 'step': 1986, 'epoch': 2} {'type': 'loss', 'content': 0.25408488512039185, 'timestamp': '2025-09-05 09:04:38.814460', 'step': 1987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:39.009224', 'step': 1987, 'epoch': 2} {'type': 'loss', 'content': 0.18480801582336426, 'timestamp': '2025-09-05 09:04:39.018433', 'step': 1988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:39.182413', 'step': 1988, 'epoch': 2} {'type': 'loss', 'content': 0.305034339427948, 'timestamp': '2025-09-05 09:04:39.184267', 'step': 1989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:39.390607', 'step': 1989, 'epoch': 2} {'type': 'loss', 'content': 0.23244498670101166, 'timestamp': '2025-09-05 09:04:39.392494', 'step': 1990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:39.598816', 'step': 1990, 'epoch': 2} {'type': 'loss', 'content': 0.40569478273391724, 'timestamp': '2025-09-05 09:04:39.600817', 'step': 1991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:39.768888', 'step': 1991, 'epoch': 2} {'type': 'loss', 'content': 0.3343004882335663, 'timestamp': '2025-09-05 09:04:39.784051', 'step': 1992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:39.972490', 'step': 1992, 'epoch': 2} {'type': 'loss', 'content': 0.2612588405609131, 'timestamp': '2025-09-05 09:04:39.974222', 'step': 1993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:04:40.142246', 'step': 1993, 'epoch': 2} {'type': 'loss', 'content': 0.3457273542881012, 'timestamp': '2025-09-05 09:04:40.144612', 'step': 1994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:40.311740', 'step': 1994, 'epoch': 2} {'type': 'loss', 'content': 0.32286980748176575, 'timestamp': '2025-09-05 09:04:40.314822', 'step': 1995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:40.521126', 'step': 1995, 'epoch': 2} {'type': 'loss', 'content': 0.32212406396865845, 'timestamp': '2025-09-05 09:04:40.530489', 'step': 1996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:40.694811', 'step': 1996, 'epoch': 2} {'type': 'loss', 'content': 0.35238468647003174, 'timestamp': '2025-09-05 09:04:40.697027', 'step': 1997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:40.864697', 'step': 1997, 'epoch': 2} {'type': 'loss', 'content': 0.3432472050189972, 'timestamp': '2025-09-05 09:04:40.866865', 'step': 1998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:41.064125', 'step': 1998, 'epoch': 2} {'type': 'loss', 'content': 0.30503416061401367, 'timestamp': '2025-09-05 09:04:41.066609', 'step': 1999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:41.234458', 'step': 1999, 'epoch': 2} {'type': 'loss', 'content': 0.26616600155830383, 'timestamp': '2025-09-05 09:04:41.248958', 'step': 2000, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:04:46.035115', 'step': 2000, 'epoch': 2} {'type': 'pplx', 'content': 57.183245029940046, 'timestamp': '2025-09-05 09:04:46.039931', 'step': 2000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-09-05 09:04:46.512943', 'step': 2000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:46.680863', 'step': 2000, 'epoch': 2} {'type': 'loss', 'content': 0.27458614110946655, 'timestamp': '2025-09-05 09:04:46.683153', 'step': 2001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:46.878428', 'step': 2001, 'epoch': 2} {'type': 'loss', 'content': 0.3308562934398651, 'timestamp': '2025-09-05 09:04:46.880173', 'step': 2002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:47.085991', 'step': 2002, 'epoch': 2} {'type': 'loss', 'content': 0.4302447736263275, 'timestamp': '2025-09-05 09:04:47.089966', 'step': 2003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:47.289790', 'step': 2003, 'epoch': 2} {'type': 'loss', 'content': 0.23679831624031067, 'timestamp': '2025-09-05 09:04:47.299469', 'step': 2004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:47.462911', 'step': 2004, 'epoch': 2} {'type': 'loss', 'content': 0.3112923204898834, 'timestamp': '2025-09-05 09:04:47.465129', 'step': 2005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:47.632457', 'step': 2005, 'epoch': 2} {'type': 'loss', 'content': 0.30558183789253235, 'timestamp': '2025-09-05 09:04:47.634451', 'step': 2006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:47.839869', 'step': 2006, 'epoch': 2} {'type': 'loss', 'content': 0.3617997467517853, 'timestamp': '2025-09-05 09:04:47.842181', 'step': 2007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:48.008505', 'step': 2007, 'epoch': 2} {'type': 'loss', 'content': 0.23063942790031433, 'timestamp': '2025-09-05 09:04:48.025193', 'step': 2008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:48.221927', 'step': 2008, 'epoch': 2} {'type': 'loss', 'content': 0.36768314242362976, 'timestamp': '2025-09-05 09:04:48.224032', 'step': 2009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:48.391869', 'step': 2009, 'epoch': 2} {'type': 'loss', 'content': 0.3346289396286011, 'timestamp': '2025-09-05 09:04:48.393707', 'step': 2010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:04:48.590531', 'step': 2010, 'epoch': 2} {'type': 'loss', 'content': 0.3734259605407715, 'timestamp': '2025-09-05 09:04:48.595428', 'step': 2011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:48.810951', 'step': 2011, 'epoch': 2} {'type': 'loss', 'content': 0.3368525505065918, 'timestamp': '2025-09-05 09:04:48.825300', 'step': 2012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:49.013722', 'step': 2012, 'epoch': 2} {'type': 'loss', 'content': 0.3550032377243042, 'timestamp': '2025-09-05 09:04:49.015412', 'step': 2013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:49.220262', 'step': 2013, 'epoch': 2} {'type': 'loss', 'content': 0.42761796712875366, 'timestamp': '2025-09-05 09:04:49.222077', 'step': 2014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:49.428047', 'step': 2014, 'epoch': 2} {'type': 'loss', 'content': 0.23654963076114655, 'timestamp': '2025-09-05 09:04:49.430134', 'step': 2015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:49.596867', 'step': 2015, 'epoch': 2} {'type': 'loss', 'content': 0.2988823354244232, 'timestamp': '2025-09-05 09:04:49.611145', 'step': 2016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:49.799003', 'step': 2016, 'epoch': 2} {'type': 'loss', 'content': 0.265523761510849, 'timestamp': '2025-09-05 09:04:49.800983', 'step': 2017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:50.008530', 'step': 2017, 'epoch': 2} {'type': 'loss', 'content': 0.4000529944896698, 'timestamp': '2025-09-05 09:04:50.010746', 'step': 2018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:50.178544', 'step': 2018, 'epoch': 2} {'type': 'loss', 'content': 0.24551428854465485, 'timestamp': '2025-09-05 09:04:50.180673', 'step': 2019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:50.380479', 'step': 2019, 'epoch': 2} {'type': 'loss', 'content': 0.15637749433517456, 'timestamp': '2025-09-05 09:04:50.390664', 'step': 2020, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:04:55.197240', 'step': 2020, 'epoch': 2} {'type': 'pplx', 'content': 57.22559411581902, 'timestamp': '2025-09-05 09:04:55.199506', 'step': 2020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:55.360725', 'step': 2020, 'epoch': 2} {'type': 'loss', 'content': 0.29269856214523315, 'timestamp': '2025-09-05 09:04:55.362490', 'step': 2021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:55.532762', 'step': 2021, 'epoch': 2} {'type': 'loss', 'content': 0.40585339069366455, 'timestamp': '2025-09-05 09:04:55.535125', 'step': 2022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:55.741574', 'step': 2022, 'epoch': 2} {'type': 'loss', 'content': 0.4142700135707855, 'timestamp': '2025-09-05 09:04:55.743607', 'step': 2023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:55.911896', 'step': 2023, 'epoch': 2} {'type': 'loss', 'content': 0.3398078680038452, 'timestamp': '2025-09-05 09:04:55.927449', 'step': 2024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:56.115575', 'step': 2024, 'epoch': 2} {'type': 'loss', 'content': 0.27966639399528503, 'timestamp': '2025-09-05 09:04:56.117340', 'step': 2025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:56.283716', 'step': 2025, 'epoch': 2} {'type': 'loss', 'content': 0.3459966480731964, 'timestamp': '2025-09-05 09:04:56.285535', 'step': 2026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:56.479497', 'step': 2026, 'epoch': 2} {'type': 'loss', 'content': 0.26111602783203125, 'timestamp': '2025-09-05 09:04:56.481512', 'step': 2027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:56.647710', 'step': 2027, 'epoch': 2} {'type': 'loss', 'content': 0.2696475386619568, 'timestamp': '2025-09-05 09:04:56.663724', 'step': 2028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:56.855157', 'step': 2028, 'epoch': 2} {'type': 'loss', 'content': 0.44315868616104126, 'timestamp': '2025-09-05 09:04:56.858681', 'step': 2029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:57.029209', 'step': 2029, 'epoch': 2} {'type': 'loss', 'content': 0.37835097312927246, 'timestamp': '2025-09-05 09:04:57.031189', 'step': 2030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:04:57.198452', 'step': 2030, 'epoch': 2} {'type': 'loss', 'content': 0.4174140989780426, 'timestamp': '2025-09-05 09:04:57.200571', 'step': 2031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:57.397154', 'step': 2031, 'epoch': 2} {'type': 'loss', 'content': 0.3270459473133087, 'timestamp': '2025-09-05 09:04:57.407649', 'step': 2032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:57.569889', 'step': 2032, 'epoch': 2} {'type': 'loss', 'content': 0.22918665409088135, 'timestamp': '2025-09-05 09:04:57.572029', 'step': 2033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:57.778081', 'step': 2033, 'epoch': 2} {'type': 'loss', 'content': 0.37719905376434326, 'timestamp': '2025-09-05 09:04:57.780047', 'step': 2034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:04:57.985555', 'step': 2034, 'epoch': 2} {'type': 'loss', 'content': 0.32334667444229126, 'timestamp': '2025-09-05 09:04:57.989649', 'step': 2035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:58.201644', 'step': 2035, 'epoch': 2} {'type': 'loss', 'content': 0.343068391084671, 'timestamp': '2025-09-05 09:04:58.259790', 'step': 2036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:04:58.509098', 'step': 2036, 'epoch': 2} {'type': 'loss', 'content': 0.4084326922893524, 'timestamp': '2025-09-05 09:04:58.511936', 'step': 2037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:04:58.717269', 'step': 2037, 'epoch': 2} {'type': 'loss', 'content': 0.24380765855312347, 'timestamp': '2025-09-05 09:04:58.720014', 'step': 2038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:04:58.927965', 'step': 2038, 'epoch': 2} {'type': 'loss', 'content': 0.21494068205356598, 'timestamp': '2025-09-05 09:04:58.930477', 'step': 2039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:04:59.102037', 'step': 2039, 'epoch': 2} {'type': 'loss', 'content': 0.24519406259059906, 'timestamp': '2025-09-05 09:04:59.112573', 'step': 2040, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:05:03.749817', 'step': 2040, 'epoch': 2} {'type': 'pplx', 'content': 56.85761600576899, 'timestamp': '2025-09-05 09:05:03.751727', 'step': 2040, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2040', 'timestamp': '2025-09-05 09:05:04.212748', 'step': 2040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:04.379772', 'step': 2040, 'epoch': 2} {'type': 'loss', 'content': 0.21963070333003998, 'timestamp': '2025-09-05 09:05:04.381772', 'step': 2041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:04.576449', 'step': 2041, 'epoch': 2} {'type': 'loss', 'content': 0.2468557208776474, 'timestamp': '2025-09-05 09:05:04.580133', 'step': 2042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:04.775998', 'step': 2042, 'epoch': 2} {'type': 'loss', 'content': 0.38554346561431885, 'timestamp': '2025-09-05 09:05:04.778595', 'step': 2043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:04.948336', 'step': 2043, 'epoch': 2} {'type': 'loss', 'content': 0.2700977325439453, 'timestamp': '2025-09-05 09:05:04.965970', 'step': 2044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:05:05.163837', 'step': 2044, 'epoch': 2} {'type': 'loss', 'content': 0.32830366492271423, 'timestamp': '2025-09-05 09:05:05.167027', 'step': 2045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:05.337485', 'step': 2045, 'epoch': 2} {'type': 'loss', 'content': 0.41252601146698, 'timestamp': '2025-09-05 09:05:05.339559', 'step': 2046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:05.506711', 'step': 2046, 'epoch': 2} {'type': 'loss', 'content': 0.23501189053058624, 'timestamp': '2025-09-05 09:05:05.509934', 'step': 2047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:05.676582', 'step': 2047, 'epoch': 2} {'type': 'loss', 'content': 0.406737744808197, 'timestamp': '2025-09-05 09:05:05.686266', 'step': 2048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:05.850915', 'step': 2048, 'epoch': 2} {'type': 'loss', 'content': 0.3673907220363617, 'timestamp': '2025-09-05 09:05:05.852489', 'step': 2049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:06.047152', 'step': 2049, 'epoch': 2} {'type': 'loss', 'content': 0.3338184058666229, 'timestamp': '2025-09-05 09:05:06.050231', 'step': 2050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:06.254566', 'step': 2050, 'epoch': 2} {'type': 'loss', 'content': 0.3576391935348511, 'timestamp': '2025-09-05 09:05:06.257351', 'step': 2051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:06.456315', 'step': 2051, 'epoch': 2} {'type': 'loss', 'content': 0.4063916504383087, 'timestamp': '2025-09-05 09:05:06.465686', 'step': 2052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:06.630051', 'step': 2052, 'epoch': 2} {'type': 'loss', 'content': 0.30287304520606995, 'timestamp': '2025-09-05 09:05:06.632774', 'step': 2053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:06.801724', 'step': 2053, 'epoch': 2} {'type': 'loss', 'content': 0.31338661909103394, 'timestamp': '2025-09-05 09:05:06.804131', 'step': 2054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:07.000461', 'step': 2054, 'epoch': 2} {'type': 'loss', 'content': 0.30023157596588135, 'timestamp': '2025-09-05 09:05:07.002435', 'step': 2055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:07.169863', 'step': 2055, 'epoch': 2} {'type': 'loss', 'content': 0.5755932927131653, 'timestamp': '2025-09-05 09:05:07.187296', 'step': 2056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:07.385750', 'step': 2056, 'epoch': 2} {'type': 'loss', 'content': 0.44111013412475586, 'timestamp': '2025-09-05 09:05:07.389213', 'step': 2057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:07.596791', 'step': 2057, 'epoch': 2} {'type': 'loss', 'content': 0.3329983651638031, 'timestamp': '2025-09-05 09:05:07.598769', 'step': 2058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:07.794782', 'step': 2058, 'epoch': 2} {'type': 'loss', 'content': 0.24375692009925842, 'timestamp': '2025-09-05 09:05:07.797646', 'step': 2059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:07.965368', 'step': 2059, 'epoch': 2} {'type': 'loss', 'content': 0.27790704369544983, 'timestamp': '2025-09-05 09:05:07.982541', 'step': 2060, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:05:12.660572', 'step': 2060, 'epoch': 2} {'type': 'pplx', 'content': 56.09090719183764, 'timestamp': '2025-09-05 09:05:12.662701', 'step': 2060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:12.824471', 'step': 2060, 'epoch': 2} {'type': 'loss', 'content': 0.2383831888437271, 'timestamp': '2025-09-05 09:05:12.826666', 'step': 2061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:12.993388', 'step': 2061, 'epoch': 2} {'type': 'loss', 'content': 0.3052305579185486, 'timestamp': '2025-09-05 09:05:12.995388', 'step': 2062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:13.161817', 'step': 2062, 'epoch': 2} {'type': 'loss', 'content': 0.18823660910129547, 'timestamp': '2025-09-05 09:05:13.166329', 'step': 2063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:13.365333', 'step': 2063, 'epoch': 2} {'type': 'loss', 'content': 0.28078800439834595, 'timestamp': '2025-09-05 09:05:13.422659', 'step': 2064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:13.620690', 'step': 2064, 'epoch': 2} {'type': 'loss', 'content': 0.42778801918029785, 'timestamp': '2025-09-05 09:05:13.622539', 'step': 2065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:13.818849', 'step': 2065, 'epoch': 2} {'type': 'loss', 'content': 0.3101421594619751, 'timestamp': '2025-09-05 09:05:13.821061', 'step': 2066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:14.017518', 'step': 2066, 'epoch': 2} {'type': 'loss', 'content': 0.27489471435546875, 'timestamp': '2025-09-05 09:05:14.020061', 'step': 2067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:14.228453', 'step': 2067, 'epoch': 2} {'type': 'loss', 'content': 0.23832768201828003, 'timestamp': '2025-09-05 09:05:14.238406', 'step': 2068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:14.403227', 'step': 2068, 'epoch': 2} {'type': 'loss', 'content': 0.3375934064388275, 'timestamp': '2025-09-05 09:05:14.405573', 'step': 2069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:14.610505', 'step': 2069, 'epoch': 2} {'type': 'loss', 'content': 0.3630978763103485, 'timestamp': '2025-09-05 09:05:14.612484', 'step': 2070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:05:14.779489', 'step': 2070, 'epoch': 2} {'type': 'loss', 'content': 0.3400574326515198, 'timestamp': '2025-09-05 09:05:14.781916', 'step': 2071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:14.980142', 'step': 2071, 'epoch': 2} {'type': 'loss', 'content': 0.40873268246650696, 'timestamp': '2025-09-05 09:05:14.994736', 'step': 2072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:15.184588', 'step': 2072, 'epoch': 2} {'type': 'loss', 'content': 0.3188339173793793, 'timestamp': '2025-09-05 09:05:15.187393', 'step': 2073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:15.356461', 'step': 2073, 'epoch': 2} {'type': 'loss', 'content': 0.2616739869117737, 'timestamp': '2025-09-05 09:05:15.358806', 'step': 2074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:15.556729', 'step': 2074, 'epoch': 2} {'type': 'loss', 'content': 0.38481009006500244, 'timestamp': '2025-09-05 09:05:15.558582', 'step': 2075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:15.727982', 'step': 2075, 'epoch': 2} {'type': 'loss', 'content': 0.3673509657382965, 'timestamp': '2025-09-05 09:05:15.737912', 'step': 2076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:15.904494', 'step': 2076, 'epoch': 2} {'type': 'loss', 'content': 0.34988126158714294, 'timestamp': '2025-09-05 09:05:15.906553', 'step': 2077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:16.111211', 'step': 2077, 'epoch': 2} {'type': 'loss', 'content': 0.262952983379364, 'timestamp': '2025-09-05 09:05:16.113704', 'step': 2078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:16.280634', 'step': 2078, 'epoch': 2} {'type': 'loss', 'content': 0.33671560883522034, 'timestamp': '2025-09-05 09:05:16.283544', 'step': 2079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:16.478700', 'step': 2079, 'epoch': 2} {'type': 'loss', 'content': 0.35512277483940125, 'timestamp': '2025-09-05 09:05:16.488316', 'step': 2080, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:05:21.153968', 'step': 2080, 'epoch': 2} {'type': 'pplx', 'content': 56.30905831955004, 'timestamp': '2025-09-05 09:05:21.155970', 'step': 2080, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2080', 'timestamp': '2025-09-05 09:05:21.636265', 'step': 2080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:21.805059', 'step': 2080, 'epoch': 2} {'type': 'loss', 'content': 0.3978932201862335, 'timestamp': '2025-09-05 09:05:21.806980', 'step': 2081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:21.973582', 'step': 2081, 'epoch': 2} {'type': 'loss', 'content': 0.23755325376987457, 'timestamp': '2025-09-05 09:05:21.975776', 'step': 2082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:22.181802', 'step': 2082, 'epoch': 2} {'type': 'loss', 'content': 0.26282647252082825, 'timestamp': '2025-09-05 09:05:22.184030', 'step': 2083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:22.390211', 'step': 2083, 'epoch': 2} {'type': 'loss', 'content': 0.22978627681732178, 'timestamp': '2025-09-05 09:05:22.399814', 'step': 2084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:22.564860', 'step': 2084, 'epoch': 2} {'type': 'loss', 'content': 0.34872758388519287, 'timestamp': '2025-09-05 09:05:22.566428', 'step': 2085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:22.733852', 'step': 2085, 'epoch': 2} {'type': 'loss', 'content': 0.26038891077041626, 'timestamp': '2025-09-05 09:05:22.735811', 'step': 2086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:22.941243', 'step': 2086, 'epoch': 2} {'type': 'loss', 'content': 0.2772735357284546, 'timestamp': '2025-09-05 09:05:22.943426', 'step': 2087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:23.171037', 'step': 2087, 'epoch': 2} {'type': 'loss', 'content': 0.3665330111980438, 'timestamp': '2025-09-05 09:05:23.185480', 'step': 2088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:23.377188', 'step': 2088, 'epoch': 2} {'type': 'loss', 'content': 0.30773794651031494, 'timestamp': '2025-09-05 09:05:23.378924', 'step': 2089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:23.547670', 'step': 2089, 'epoch': 2} {'type': 'loss', 'content': 0.3149157464504242, 'timestamp': '2025-09-05 09:05:23.549514', 'step': 2090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:23.716414', 'step': 2090, 'epoch': 2} {'type': 'loss', 'content': 0.26793622970581055, 'timestamp': '2025-09-05 09:05:23.718590', 'step': 2091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:23.915903', 'step': 2091, 'epoch': 2} {'type': 'loss', 'content': 0.23315511643886566, 'timestamp': '2025-09-05 09:05:23.930531', 'step': 2092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:24.126973', 'step': 2092, 'epoch': 2} {'type': 'loss', 'content': 0.33972030878067017, 'timestamp': '2025-09-05 09:05:24.129837', 'step': 2093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:24.328693', 'step': 2093, 'epoch': 2} {'type': 'loss', 'content': 0.24864479899406433, 'timestamp': '2025-09-05 09:05:24.332201', 'step': 2094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:24.530634', 'step': 2094, 'epoch': 2} {'type': 'loss', 'content': 0.4089866876602173, 'timestamp': '2025-09-05 09:05:24.532314', 'step': 2095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:24.700655', 'step': 2095, 'epoch': 2} {'type': 'loss', 'content': 0.36251306533813477, 'timestamp': '2025-09-05 09:05:24.710674', 'step': 2096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:05:24.877319', 'step': 2096, 'epoch': 2} {'type': 'loss', 'content': 0.36653363704681396, 'timestamp': '2025-09-05 09:05:24.880792', 'step': 2097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:25.090908', 'step': 2097, 'epoch': 2} {'type': 'loss', 'content': 0.2979757487773895, 'timestamp': '2025-09-05 09:05:25.093136', 'step': 2098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:25.264142', 'step': 2098, 'epoch': 2} {'type': 'loss', 'content': 0.3006058633327484, 'timestamp': '2025-09-05 09:05:25.266754', 'step': 2099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:25.476399', 'step': 2099, 'epoch': 2} {'type': 'loss', 'content': 0.3439769148826599, 'timestamp': '2025-09-05 09:05:25.493166', 'step': 2100, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:05:30.301851', 'step': 2100, 'epoch': 2} {'type': 'pplx', 'content': 55.87515440553129, 'timestamp': '2025-09-05 09:05:30.304097', 'step': 2100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:05:30.467899', 'step': 2100, 'epoch': 2} {'type': 'loss', 'content': 0.3255309760570526, 'timestamp': '2025-09-05 09:05:30.469795', 'step': 2101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:30.635767', 'step': 2101, 'epoch': 2} {'type': 'loss', 'content': 0.23818431794643402, 'timestamp': '2025-09-05 09:05:30.637417', 'step': 2102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:30.804649', 'step': 2102, 'epoch': 2} {'type': 'loss', 'content': 0.2935701310634613, 'timestamp': '2025-09-05 09:05:30.806664', 'step': 2103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:31.003143', 'step': 2103, 'epoch': 2} {'type': 'loss', 'content': 0.3238224983215332, 'timestamp': '2025-09-05 09:05:31.017915', 'step': 2104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:31.207656', 'step': 2104, 'epoch': 2} {'type': 'loss', 'content': 0.26048019528388977, 'timestamp': '2025-09-05 09:05:31.209412', 'step': 2105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:31.417385', 'step': 2105, 'epoch': 2} {'type': 'loss', 'content': 0.3144626319408417, 'timestamp': '2025-09-05 09:05:31.419292', 'step': 2106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:31.615226', 'step': 2106, 'epoch': 2} {'type': 'loss', 'content': 0.31784558296203613, 'timestamp': '2025-09-05 09:05:31.617380', 'step': 2107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:31.782250', 'step': 2107, 'epoch': 2} {'type': 'loss', 'content': 0.3896116018295288, 'timestamp': '2025-09-05 09:05:31.799351', 'step': 2108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:31.999348', 'step': 2108, 'epoch': 2} {'type': 'loss', 'content': 0.5450838208198547, 'timestamp': '2025-09-05 09:05:32.001492', 'step': 2109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:32.207332', 'step': 2109, 'epoch': 2} {'type': 'loss', 'content': 0.4268830716609955, 'timestamp': '2025-09-05 09:05:32.209190', 'step': 2110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:32.376126', 'step': 2110, 'epoch': 2} {'type': 'loss', 'content': 0.2802806496620178, 'timestamp': '2025-09-05 09:05:32.379085', 'step': 2111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:32.582723', 'step': 2111, 'epoch': 2} {'type': 'loss', 'content': 0.3529273271560669, 'timestamp': '2025-09-05 09:05:32.597710', 'step': 2112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:32.786023', 'step': 2112, 'epoch': 2} {'type': 'loss', 'content': 0.37120527029037476, 'timestamp': '2025-09-05 09:05:32.788315', 'step': 2113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:32.987392', 'step': 2113, 'epoch': 2} {'type': 'loss', 'content': 0.2828807532787323, 'timestamp': '2025-09-05 09:05:32.989258', 'step': 2114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:33.197110', 'step': 2114, 'epoch': 2} {'type': 'loss', 'content': 0.3188009262084961, 'timestamp': '2025-09-05 09:05:33.198841', 'step': 2115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:33.397741', 'step': 2115, 'epoch': 2} {'type': 'loss', 'content': 0.30147457122802734, 'timestamp': '2025-09-05 09:05:33.414853', 'step': 2116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:33.606839', 'step': 2116, 'epoch': 2} {'type': 'loss', 'content': 0.39994457364082336, 'timestamp': '2025-09-05 09:05:33.609368', 'step': 2117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:33.814464', 'step': 2117, 'epoch': 2} {'type': 'loss', 'content': 0.29162168502807617, 'timestamp': '2025-09-05 09:05:33.816904', 'step': 2118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:34.012704', 'step': 2118, 'epoch': 2} {'type': 'loss', 'content': 0.36326169967651367, 'timestamp': '2025-09-05 09:05:34.015173', 'step': 2119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:34.220510', 'step': 2119, 'epoch': 2} {'type': 'loss', 'content': 0.39839819073677063, 'timestamp': '2025-09-05 09:05:34.238310', 'step': 2120, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:05:38.962153', 'step': 2120, 'epoch': 2} {'type': 'pplx', 'content': 55.023435362113, 'timestamp': '2025-09-05 09:05:38.964199', 'step': 2120, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2120', 'timestamp': '2025-09-05 09:05:39.426004', 'step': 2120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:39.590655', 'step': 2120, 'epoch': 2} {'type': 'loss', 'content': 0.298641175031662, 'timestamp': '2025-09-05 09:05:39.592486', 'step': 2121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:39.762317', 'step': 2121, 'epoch': 2} {'type': 'loss', 'content': 0.2771815061569214, 'timestamp': '2025-09-05 09:05:39.764288', 'step': 2122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:39.970270', 'step': 2122, 'epoch': 2} {'type': 'loss', 'content': 0.3446735143661499, 'timestamp': '2025-09-05 09:05:39.973486', 'step': 2123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:40.142076', 'step': 2123, 'epoch': 2} {'type': 'loss', 'content': 0.32573774456977844, 'timestamp': '2025-09-05 09:05:40.158806', 'step': 2124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:40.355566', 'step': 2124, 'epoch': 2} {'type': 'loss', 'content': 0.3329891264438629, 'timestamp': '2025-09-05 09:05:40.357398', 'step': 2125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:40.524554', 'step': 2125, 'epoch': 2} {'type': 'loss', 'content': 0.3262787163257599, 'timestamp': '2025-09-05 09:05:40.526910', 'step': 2126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:40.724145', 'step': 2126, 'epoch': 2} {'type': 'loss', 'content': 0.28894031047821045, 'timestamp': '2025-09-05 09:05:40.726046', 'step': 2127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:40.894443', 'step': 2127, 'epoch': 2} {'type': 'loss', 'content': 0.37630510330200195, 'timestamp': '2025-09-05 09:05:40.908812', 'step': 2128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:41.096804', 'step': 2128, 'epoch': 2} {'type': 'loss', 'content': 0.31172171235084534, 'timestamp': '2025-09-05 09:05:41.098750', 'step': 2129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:41.294440', 'step': 2129, 'epoch': 2} {'type': 'loss', 'content': 0.3221554756164551, 'timestamp': '2025-09-05 09:05:41.296586', 'step': 2130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:41.464390', 'step': 2130, 'epoch': 2} {'type': 'loss', 'content': 0.22313398122787476, 'timestamp': '2025-09-05 09:05:41.466652', 'step': 2131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:41.671802', 'step': 2131, 'epoch': 2} {'type': 'loss', 'content': 0.27833616733551025, 'timestamp': '2025-09-05 09:05:41.688374', 'step': 2132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:41.888539', 'step': 2132, 'epoch': 2} {'type': 'loss', 'content': 0.35830333828926086, 'timestamp': '2025-09-05 09:05:41.890675', 'step': 2133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:42.057035', 'step': 2133, 'epoch': 2} {'type': 'loss', 'content': 0.3745828866958618, 'timestamp': '2025-09-05 09:05:42.059027', 'step': 2134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:42.256318', 'step': 2134, 'epoch': 2} {'type': 'loss', 'content': 0.4300999939441681, 'timestamp': '2025-09-05 09:05:42.258212', 'step': 2135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:42.425915', 'step': 2135, 'epoch': 2} {'type': 'loss', 'content': 0.4072500169277191, 'timestamp': '2025-09-05 09:05:42.440937', 'step': 2136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:42.629269', 'step': 2136, 'epoch': 2} {'type': 'loss', 'content': 0.2526228129863739, 'timestamp': '2025-09-05 09:05:42.631217', 'step': 2137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:42.837475', 'step': 2137, 'epoch': 2} {'type': 'loss', 'content': 0.39318081736564636, 'timestamp': '2025-09-05 09:05:42.839334', 'step': 2138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:43.005908', 'step': 2138, 'epoch': 2} {'type': 'loss', 'content': 0.2297356128692627, 'timestamp': '2025-09-05 09:05:43.008470', 'step': 2139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:05:43.206987', 'step': 2139, 'epoch': 2} {'type': 'loss', 'content': 0.33987030386924744, 'timestamp': '2025-09-05 09:05:43.216353', 'step': 2140, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:05:47.865119', 'step': 2140, 'epoch': 2} {'type': 'pplx', 'content': 55.06809153484284, 'timestamp': '2025-09-05 09:05:47.867422', 'step': 2140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:48.032484', 'step': 2140, 'epoch': 2} {'type': 'loss', 'content': 0.4749826192855835, 'timestamp': '2025-09-05 09:05:48.034336', 'step': 2141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:48.200139', 'step': 2141, 'epoch': 2} {'type': 'loss', 'content': 0.28325966000556946, 'timestamp': '2025-09-05 09:05:48.201779', 'step': 2142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:48.369121', 'step': 2142, 'epoch': 2} {'type': 'loss', 'content': 0.42103084921836853, 'timestamp': '2025-09-05 09:05:48.370850', 'step': 2143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:48.566526', 'step': 2143, 'epoch': 2} {'type': 'loss', 'content': 0.38202551007270813, 'timestamp': '2025-09-05 09:05:48.576121', 'step': 2144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:48.738085', 'step': 2144, 'epoch': 2} {'type': 'loss', 'content': 0.3595607578754425, 'timestamp': '2025-09-05 09:05:48.740049', 'step': 2145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:48.947701', 'step': 2145, 'epoch': 2} {'type': 'loss', 'content': 0.3413144648075104, 'timestamp': '2025-09-05 09:05:48.949394', 'step': 2146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:49.156351', 'step': 2146, 'epoch': 2} {'type': 'loss', 'content': 0.24180643260478973, 'timestamp': '2025-09-05 09:05:49.157912', 'step': 2147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:49.325916', 'step': 2147, 'epoch': 2} {'type': 'loss', 'content': 0.2788175344467163, 'timestamp': '2025-09-05 09:05:49.342534', 'step': 2148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:49.540402', 'step': 2148, 'epoch': 2} {'type': 'loss', 'content': 0.39272475242614746, 'timestamp': '2025-09-05 09:05:49.542346', 'step': 2149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:49.709083', 'step': 2149, 'epoch': 2} {'type': 'loss', 'content': 0.2623364329338074, 'timestamp': '2025-09-05 09:05:49.711039', 'step': 2150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:49.919434', 'step': 2150, 'epoch': 2} {'type': 'loss', 'content': 0.21915015578269958, 'timestamp': '2025-09-05 09:05:49.921268', 'step': 2151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:50.088631', 'step': 2151, 'epoch': 2} {'type': 'loss', 'content': 0.38461872935295105, 'timestamp': '2025-09-05 09:05:50.105713', 'step': 2152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:50.302548', 'step': 2152, 'epoch': 2} {'type': 'loss', 'content': 0.4068615138530731, 'timestamp': '2025-09-05 09:05:50.304488', 'step': 2153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:50.472419', 'step': 2153, 'epoch': 2} {'type': 'loss', 'content': 0.3176921606063843, 'timestamp': '2025-09-05 09:05:50.474129', 'step': 2154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:50.681724', 'step': 2154, 'epoch': 2} {'type': 'loss', 'content': 0.34700101613998413, 'timestamp': '2025-09-05 09:05:50.683454', 'step': 2155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:50.851655', 'step': 2155, 'epoch': 2} {'type': 'loss', 'content': 0.3761594891548157, 'timestamp': '2025-09-05 09:05:50.869173', 'step': 2156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:51.068367', 'step': 2156, 'epoch': 2} {'type': 'loss', 'content': 0.1952638030052185, 'timestamp': '2025-09-05 09:05:51.070604', 'step': 2157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:05:51.320283', 'step': 2157, 'epoch': 2} {'type': 'loss', 'content': 0.4499374330043793, 'timestamp': '2025-09-05 09:05:51.322342', 'step': 2158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:51.521209', 'step': 2158, 'epoch': 2} {'type': 'loss', 'content': 0.2834685146808624, 'timestamp': '2025-09-05 09:05:51.523072', 'step': 2159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:51.731601', 'step': 2159, 'epoch': 2} {'type': 'loss', 'content': 0.27654868364334106, 'timestamp': '2025-09-05 09:05:51.745727', 'step': 2160, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:05:57.239086', 'step': 2160, 'epoch': 2} {'type': 'pplx', 'content': 55.48316011694305, 'timestamp': '2025-09-05 09:05:57.242217', 'step': 2160, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2160', 'timestamp': '2025-09-05 09:05:57.693603', 'step': 2160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:57.858500', 'step': 2160, 'epoch': 2} {'type': 'loss', 'content': 0.35953032970428467, 'timestamp': '2025-09-05 09:05:57.860617', 'step': 2161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:58.029205', 'step': 2161, 'epoch': 2} {'type': 'loss', 'content': 0.4004409611225128, 'timestamp': '2025-09-05 09:05:58.030979', 'step': 2162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:58.198364', 'step': 2162, 'epoch': 2} {'type': 'loss', 'content': 0.45261430740356445, 'timestamp': '2025-09-05 09:05:58.200549', 'step': 2163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:58.367098', 'step': 2163, 'epoch': 2} {'type': 'loss', 'content': 0.1983097940683365, 'timestamp': '2025-09-05 09:05:58.382083', 'step': 2164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:58.571827', 'step': 2164, 'epoch': 2} {'type': 'loss', 'content': 0.2994425594806671, 'timestamp': '2025-09-05 09:05:58.573488', 'step': 2165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:05:58.743371', 'step': 2165, 'epoch': 2} {'type': 'loss', 'content': 0.3872717618942261, 'timestamp': '2025-09-05 09:05:58.745170', 'step': 2166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:58.953533', 'step': 2166, 'epoch': 2} {'type': 'loss', 'content': 0.2745550572872162, 'timestamp': '2025-09-05 09:05:58.955400', 'step': 2167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:05:59.164803', 'step': 2167, 'epoch': 2} {'type': 'loss', 'content': 0.3339973986148834, 'timestamp': '2025-09-05 09:05:59.180082', 'step': 2168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:05:59.375968', 'step': 2168, 'epoch': 2} {'type': 'loss', 'content': 0.4272925555706024, 'timestamp': '2025-09-05 09:05:59.378179', 'step': 2169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:59.586272', 'step': 2169, 'epoch': 2} {'type': 'loss', 'content': 0.2760466933250427, 'timestamp': '2025-09-05 09:05:59.588264', 'step': 2170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:59.795018', 'step': 2170, 'epoch': 2} {'type': 'loss', 'content': 0.25868964195251465, 'timestamp': '2025-09-05 09:05:59.796794', 'step': 2171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:05:59.992597', 'step': 2171, 'epoch': 2} {'type': 'loss', 'content': 0.2131020873785019, 'timestamp': '2025-09-05 09:06:00.009053', 'step': 2172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:00.205797', 'step': 2172, 'epoch': 2} {'type': 'loss', 'content': 0.3480170667171478, 'timestamp': '2025-09-05 09:06:00.207987', 'step': 2173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:00.375416', 'step': 2173, 'epoch': 2} {'type': 'loss', 'content': 0.38949131965637207, 'timestamp': '2025-09-05 09:06:00.377072', 'step': 2174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:06:00.582310', 'step': 2174, 'epoch': 2} {'type': 'loss', 'content': 0.3337663412094116, 'timestamp': '2025-09-05 09:06:00.584027', 'step': 2175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:00.780812', 'step': 2175, 'epoch': 2} {'type': 'loss', 'content': 0.48506397008895874, 'timestamp': '2025-09-05 09:06:00.796080', 'step': 2176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:00.988006', 'step': 2176, 'epoch': 2} {'type': 'loss', 'content': 0.34281837940216064, 'timestamp': '2025-09-05 09:06:00.989875', 'step': 2177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:01.188090', 'step': 2177, 'epoch': 2} {'type': 'loss', 'content': 0.2596714496612549, 'timestamp': '2025-09-05 09:06:01.189855', 'step': 2178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:01.360399', 'step': 2178, 'epoch': 2} {'type': 'loss', 'content': 0.20409443974494934, 'timestamp': '2025-09-05 09:06:01.362078', 'step': 2179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:01.567175', 'step': 2179, 'epoch': 2} {'type': 'loss', 'content': 0.34518057107925415, 'timestamp': '2025-09-05 09:06:01.581103', 'step': 2180, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:06:06.460758', 'step': 2180, 'epoch': 2} {'type': 'pplx', 'content': 55.87232859467735, 'timestamp': '2025-09-05 09:06:06.462417', 'step': 2180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:06.626251', 'step': 2180, 'epoch': 2} {'type': 'loss', 'content': 0.3369835317134857, 'timestamp': '2025-09-05 09:06:06.628361', 'step': 2181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:06.797342', 'step': 2181, 'epoch': 2} {'type': 'loss', 'content': 0.3712189793586731, 'timestamp': '2025-09-05 09:06:06.798930', 'step': 2182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:06.969597', 'step': 2182, 'epoch': 2} {'type': 'loss', 'content': 0.3187856376171112, 'timestamp': '2025-09-05 09:06:06.972251', 'step': 2183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:07.140449', 'step': 2183, 'epoch': 2} {'type': 'loss', 'content': 0.20823392271995544, 'timestamp': '2025-09-05 09:06:07.158686', 'step': 2184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:07.365847', 'step': 2184, 'epoch': 2} {'type': 'loss', 'content': 0.3842635154724121, 'timestamp': '2025-09-05 09:06:07.372022', 'step': 2185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:07.583304', 'step': 2185, 'epoch': 2} {'type': 'loss', 'content': 0.24682661890983582, 'timestamp': '2025-09-05 09:06:07.586882', 'step': 2186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:07.785086', 'step': 2186, 'epoch': 2} {'type': 'loss', 'content': 0.32766950130462646, 'timestamp': '2025-09-05 09:06:07.788027', 'step': 2187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:07.989795', 'step': 2187, 'epoch': 2} {'type': 'loss', 'content': 0.49860307574272156, 'timestamp': '2025-09-05 09:06:08.007831', 'step': 2188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:08.211143', 'step': 2188, 'epoch': 2} {'type': 'loss', 'content': 0.4436277449131012, 'timestamp': '2025-09-05 09:06:08.213455', 'step': 2189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:08.420670', 'step': 2189, 'epoch': 2} {'type': 'loss', 'content': 0.24359887838363647, 'timestamp': '2025-09-05 09:06:08.423027', 'step': 2190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:08.629797', 'step': 2190, 'epoch': 2} {'type': 'loss', 'content': 0.2521995007991791, 'timestamp': '2025-09-05 09:06:08.632822', 'step': 2191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:08.799043', 'step': 2191, 'epoch': 2} {'type': 'loss', 'content': 0.4108291566371918, 'timestamp': '2025-09-05 09:06:08.814395', 'step': 2192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:09.001706', 'step': 2192, 'epoch': 2} {'type': 'loss', 'content': 0.4450414478778839, 'timestamp': '2025-09-05 09:06:09.004367', 'step': 2193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:09.174249', 'step': 2193, 'epoch': 2} {'type': 'loss', 'content': 0.3344271779060364, 'timestamp': '2025-09-05 09:06:09.176943', 'step': 2194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:09.385077', 'step': 2194, 'epoch': 2} {'type': 'loss', 'content': 0.3703814744949341, 'timestamp': '2025-09-05 09:06:09.388491', 'step': 2195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:09.556016', 'step': 2195, 'epoch': 2} {'type': 'loss', 'content': 0.44833752512931824, 'timestamp': '2025-09-05 09:06:09.566350', 'step': 2196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:09.730307', 'step': 2196, 'epoch': 2} {'type': 'loss', 'content': 0.3722361922264099, 'timestamp': '2025-09-05 09:06:09.732642', 'step': 2197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:09.938807', 'step': 2197, 'epoch': 2} {'type': 'loss', 'content': 0.43370842933654785, 'timestamp': '2025-09-05 09:06:09.941992', 'step': 2198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:10.138574', 'step': 2198, 'epoch': 2} {'type': 'loss', 'content': 0.27109774947166443, 'timestamp': '2025-09-05 09:06:10.142016', 'step': 2199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:10.347324', 'step': 2199, 'epoch': 2} {'type': 'loss', 'content': 0.23769542574882507, 'timestamp': '2025-09-05 09:06:10.365235', 'step': 2200, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:06:15.248799', 'step': 2200, 'epoch': 2} {'type': 'pplx', 'content': 55.81018461535365, 'timestamp': '2025-09-05 09:06:15.251062', 'step': 2200, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2200', 'timestamp': '2025-09-05 09:06:15.710155', 'step': 2200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:15.875503', 'step': 2200, 'epoch': 2} {'type': 'loss', 'content': 0.3465867042541504, 'timestamp': '2025-09-05 09:06:15.877472', 'step': 2201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:16.045988', 'step': 2201, 'epoch': 2} {'type': 'loss', 'content': 0.28614541888237, 'timestamp': '2025-09-05 09:06:16.047770', 'step': 2202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:16.215359', 'step': 2202, 'epoch': 2} {'type': 'loss', 'content': 0.2435327023267746, 'timestamp': '2025-09-05 09:06:16.217829', 'step': 2203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:16.412558', 'step': 2203, 'epoch': 2} {'type': 'loss', 'content': 0.28242167830467224, 'timestamp': '2025-09-05 09:06:16.429917', 'step': 2204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:16.627712', 'step': 2204, 'epoch': 2} {'type': 'loss', 'content': 0.40695592761039734, 'timestamp': '2025-09-05 09:06:16.629514', 'step': 2205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:16.796399', 'step': 2205, 'epoch': 2} {'type': 'loss', 'content': 0.2968638241291046, 'timestamp': '2025-09-05 09:06:16.798607', 'step': 2206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:16.995784', 'step': 2206, 'epoch': 2} {'type': 'loss', 'content': 0.29859426617622375, 'timestamp': '2025-09-05 09:06:16.997868', 'step': 2207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:17.204998', 'step': 2207, 'epoch': 2} {'type': 'loss', 'content': 0.2591385841369629, 'timestamp': '2025-09-05 09:06:17.219261', 'step': 2208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:17.409143', 'step': 2208, 'epoch': 2} {'type': 'loss', 'content': 0.3400825262069702, 'timestamp': '2025-09-05 09:06:17.440817', 'step': 2209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:17.693884', 'step': 2209, 'epoch': 2} {'type': 'loss', 'content': 0.3577132821083069, 'timestamp': '2025-09-05 09:06:17.695615', 'step': 2210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:17.895401', 'step': 2210, 'epoch': 2} {'type': 'loss', 'content': 0.346355140209198, 'timestamp': '2025-09-05 09:06:17.897200', 'step': 2211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:18.065043', 'step': 2211, 'epoch': 2} {'type': 'loss', 'content': 0.302179217338562, 'timestamp': '2025-09-05 09:06:18.081108', 'step': 2212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:18.271084', 'step': 2212, 'epoch': 2} {'type': 'loss', 'content': 0.27992647886276245, 'timestamp': '2025-09-05 09:06:18.273478', 'step': 2213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:18.440962', 'step': 2213, 'epoch': 2} {'type': 'loss', 'content': 0.2065676599740982, 'timestamp': '2025-09-05 09:06:18.443396', 'step': 2214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:18.655269', 'step': 2214, 'epoch': 2} {'type': 'loss', 'content': 0.28311607241630554, 'timestamp': '2025-09-05 09:06:18.657127', 'step': 2215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:18.824537', 'step': 2215, 'epoch': 2} {'type': 'loss', 'content': 0.3312028646469116, 'timestamp': '2025-09-05 09:06:18.838942', 'step': 2216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:19.028366', 'step': 2216, 'epoch': 2} {'type': 'loss', 'content': 0.2649993896484375, 'timestamp': '2025-09-05 09:06:19.030572', 'step': 2217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:19.239728', 'step': 2217, 'epoch': 2} {'type': 'loss', 'content': 0.2982449233531952, 'timestamp': '2025-09-05 09:06:19.241520', 'step': 2218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:19.450240', 'step': 2218, 'epoch': 2} {'type': 'loss', 'content': 0.27559399604797363, 'timestamp': '2025-09-05 09:06:19.454792', 'step': 2219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:19.653922', 'step': 2219, 'epoch': 2} {'type': 'loss', 'content': 0.28188076615333557, 'timestamp': '2025-09-05 09:06:19.671388', 'step': 2220, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:06:24.361464', 'step': 2220, 'epoch': 2} {'type': 'pplx', 'content': 55.25948775718098, 'timestamp': '2025-09-05 09:06:24.364366', 'step': 2220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:24.528802', 'step': 2220, 'epoch': 2} {'type': 'loss', 'content': 0.28795936703681946, 'timestamp': '2025-09-05 09:06:24.530496', 'step': 2221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:24.697761', 'step': 2221, 'epoch': 2} {'type': 'loss', 'content': 0.40923991799354553, 'timestamp': '2025-09-05 09:06:24.699852', 'step': 2222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:24.867605', 'step': 2222, 'epoch': 2} {'type': 'loss', 'content': 0.31736886501312256, 'timestamp': '2025-09-05 09:06:24.869794', 'step': 2223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:25.036879', 'step': 2223, 'epoch': 2} {'type': 'loss', 'content': 0.2096620500087738, 'timestamp': '2025-09-05 09:06:25.051680', 'step': 2224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:25.239543', 'step': 2224, 'epoch': 2} {'type': 'loss', 'content': 0.484735906124115, 'timestamp': '2025-09-05 09:06:25.241461', 'step': 2225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:25.438820', 'step': 2225, 'epoch': 2} {'type': 'loss', 'content': 0.34063124656677246, 'timestamp': '2025-09-05 09:06:25.441718', 'step': 2226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:25.637874', 'step': 2226, 'epoch': 2} {'type': 'loss', 'content': 0.29765599966049194, 'timestamp': '2025-09-05 09:06:25.639588', 'step': 2227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:25.805672', 'step': 2227, 'epoch': 2} {'type': 'loss', 'content': 0.24793501198291779, 'timestamp': '2025-09-05 09:06:25.814750', 'step': 2228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:25.976719', 'step': 2228, 'epoch': 2} {'type': 'loss', 'content': 0.33141300082206726, 'timestamp': '2025-09-05 09:06:25.979149', 'step': 2229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:26.186014', 'step': 2229, 'epoch': 2} {'type': 'loss', 'content': 0.3666149079799652, 'timestamp': '2025-09-05 09:06:26.188475', 'step': 2230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:26.475436', 'step': 2230, 'epoch': 2} {'type': 'loss', 'content': 0.23982176184654236, 'timestamp': '2025-09-05 09:06:26.477600', 'step': 2231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:26.673606', 'step': 2231, 'epoch': 2} {'type': 'loss', 'content': 0.23586024343967438, 'timestamp': '2025-09-05 09:06:26.684063', 'step': 2232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:26.847693', 'step': 2232, 'epoch': 2} {'type': 'loss', 'content': 0.22031444311141968, 'timestamp': '2025-09-05 09:06:26.849808', 'step': 2233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:27.016397', 'step': 2233, 'epoch': 2} {'type': 'loss', 'content': 0.2034074366092682, 'timestamp': '2025-09-05 09:06:27.018349', 'step': 2234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:27.224905', 'step': 2234, 'epoch': 2} {'type': 'loss', 'content': 0.3312498927116394, 'timestamp': '2025-09-05 09:06:27.226712', 'step': 2235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:27.393791', 'step': 2235, 'epoch': 2} {'type': 'loss', 'content': 0.16999323666095734, 'timestamp': '2025-09-05 09:06:27.410448', 'step': 2236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:27.606051', 'step': 2236, 'epoch': 2} {'type': 'loss', 'content': 0.262175053358078, 'timestamp': '2025-09-05 09:06:27.607922', 'step': 2237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:27.804022', 'step': 2237, 'epoch': 2} {'type': 'loss', 'content': 0.3137682378292084, 'timestamp': '2025-09-05 09:06:27.805913', 'step': 2238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:28.012450', 'step': 2238, 'epoch': 2} {'type': 'loss', 'content': 0.40954217314720154, 'timestamp': '2025-09-05 09:06:28.014922', 'step': 2239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:28.182501', 'step': 2239, 'epoch': 2} {'type': 'loss', 'content': 0.38842806220054626, 'timestamp': '2025-09-05 09:06:28.199729', 'step': 2240, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:06:33.159795', 'step': 2240, 'epoch': 2} {'type': 'pplx', 'content': 54.81411080714311, 'timestamp': '2025-09-05 09:06:33.161733', 'step': 2240, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2240', 'timestamp': '2025-09-05 09:06:33.625419', 'step': 2240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:33.789231', 'step': 2240, 'epoch': 2} {'type': 'loss', 'content': 0.30577048659324646, 'timestamp': '2025-09-05 09:06:33.791819', 'step': 2241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:33.959669', 'step': 2241, 'epoch': 2} {'type': 'loss', 'content': 0.2516123950481415, 'timestamp': '2025-09-05 09:06:33.961351', 'step': 2242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:34.128513', 'step': 2242, 'epoch': 2} {'type': 'loss', 'content': 0.29832541942596436, 'timestamp': '2025-09-05 09:06:34.130926', 'step': 2243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:34.326975', 'step': 2243, 'epoch': 2} {'type': 'loss', 'content': 0.352728009223938, 'timestamp': '2025-09-05 09:06:34.336301', 'step': 2244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:34.500913', 'step': 2244, 'epoch': 2} {'type': 'loss', 'content': 0.29814958572387695, 'timestamp': '2025-09-05 09:06:34.503201', 'step': 2245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:34.671218', 'step': 2245, 'epoch': 2} {'type': 'loss', 'content': 0.3409227728843689, 'timestamp': '2025-09-05 09:06:34.673286', 'step': 2246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:06:34.867144', 'step': 2246, 'epoch': 2} {'type': 'loss', 'content': 0.37319034337997437, 'timestamp': '2025-09-05 09:06:34.869145', 'step': 2247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:35.035883', 'step': 2247, 'epoch': 2} {'type': 'loss', 'content': 0.378897100687027, 'timestamp': '2025-09-05 09:06:35.050269', 'step': 2248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:35.241028', 'step': 2248, 'epoch': 2} {'type': 'loss', 'content': 0.47297292947769165, 'timestamp': '2025-09-05 09:06:35.243250', 'step': 2249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:35.448153', 'step': 2249, 'epoch': 2} {'type': 'loss', 'content': 0.2587435245513916, 'timestamp': '2025-09-05 09:06:35.450194', 'step': 2250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:35.620591', 'step': 2250, 'epoch': 2} {'type': 'loss', 'content': 0.21650661528110504, 'timestamp': '2025-09-05 09:06:35.622564', 'step': 2251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:35.819398', 'step': 2251, 'epoch': 2} {'type': 'loss', 'content': 0.3861205279827118, 'timestamp': '2025-09-05 09:06:35.834116', 'step': 2252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:36.023040', 'step': 2252, 'epoch': 2} {'type': 'loss', 'content': 0.24851293861865997, 'timestamp': '2025-09-05 09:06:36.025229', 'step': 2253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:36.193242', 'step': 2253, 'epoch': 2} {'type': 'loss', 'content': 0.38230419158935547, 'timestamp': '2025-09-05 09:06:36.195357', 'step': 2254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:36.401772', 'step': 2254, 'epoch': 2} {'type': 'loss', 'content': 0.23924830555915833, 'timestamp': '2025-09-05 09:06:36.403897', 'step': 2255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:36.601332', 'step': 2255, 'epoch': 2} {'type': 'loss', 'content': 0.23087681829929352, 'timestamp': '2025-09-05 09:06:36.616592', 'step': 2256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:36.803359', 'step': 2256, 'epoch': 2} {'type': 'loss', 'content': 0.31772562861442566, 'timestamp': '2025-09-05 09:06:36.805156', 'step': 2257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:37.000355', 'step': 2257, 'epoch': 2} {'type': 'loss', 'content': 0.37587422132492065, 'timestamp': '2025-09-05 09:06:37.002348', 'step': 2258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:37.169293', 'step': 2258, 'epoch': 2} {'type': 'loss', 'content': 0.5363959074020386, 'timestamp': '2025-09-05 09:06:37.171488', 'step': 2259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:37.376633', 'step': 2259, 'epoch': 2} {'type': 'loss', 'content': 0.3956094980239868, 'timestamp': '2025-09-05 09:06:37.393078', 'step': 2260, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:06:42.110986', 'step': 2260, 'epoch': 2} {'type': 'pplx', 'content': 54.873000385281806, 'timestamp': '2025-09-05 09:06:42.113444', 'step': 2260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:42.274354', 'step': 2260, 'epoch': 2} {'type': 'loss', 'content': 0.3519768714904785, 'timestamp': '2025-09-05 09:06:42.276425', 'step': 2261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:42.526981', 'step': 2261, 'epoch': 2} {'type': 'loss', 'content': 0.2870665192604065, 'timestamp': '2025-09-05 09:06:42.528820', 'step': 2262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:42.732781', 'step': 2262, 'epoch': 2} {'type': 'loss', 'content': 0.3545304238796234, 'timestamp': '2025-09-05 09:06:42.734675', 'step': 2263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:42.932972', 'step': 2263, 'epoch': 2} {'type': 'loss', 'content': 0.2984987497329712, 'timestamp': '2025-09-05 09:06:42.943221', 'step': 2264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:43.105999', 'step': 2264, 'epoch': 2} {'type': 'loss', 'content': 0.3972173035144806, 'timestamp': '2025-09-05 09:06:43.108100', 'step': 2265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:43.275800', 'step': 2265, 'epoch': 2} {'type': 'loss', 'content': 0.3706391751766205, 'timestamp': '2025-09-05 09:06:43.277701', 'step': 2266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:43.471167', 'step': 2266, 'epoch': 2} {'type': 'loss', 'content': 0.36558616161346436, 'timestamp': '2025-09-05 09:06:43.473657', 'step': 2267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:43.641189', 'step': 2267, 'epoch': 2} {'type': 'loss', 'content': 0.2613338232040405, 'timestamp': '2025-09-05 09:06:43.658410', 'step': 2268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:43.856335', 'step': 2268, 'epoch': 2} {'type': 'loss', 'content': 0.3643812835216522, 'timestamp': '2025-09-05 09:06:43.858525', 'step': 2269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:44.057101', 'step': 2269, 'epoch': 2} {'type': 'loss', 'content': 0.3404228091239929, 'timestamp': '2025-09-05 09:06:44.059038', 'step': 2270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:44.228781', 'step': 2270, 'epoch': 2} {'type': 'loss', 'content': 0.4255596101284027, 'timestamp': '2025-09-05 09:06:44.230693', 'step': 2271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:44.399436', 'step': 2271, 'epoch': 2} {'type': 'loss', 'content': 0.3181018829345703, 'timestamp': '2025-09-05 09:06:44.408822', 'step': 2272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:44.571743', 'step': 2272, 'epoch': 2} {'type': 'loss', 'content': 0.2214895486831665, 'timestamp': '2025-09-05 09:06:44.573526', 'step': 2273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:44.740584', 'step': 2273, 'epoch': 2} {'type': 'loss', 'content': 0.34708890318870544, 'timestamp': '2025-09-05 09:06:44.742388', 'step': 2274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:44.938691', 'step': 2274, 'epoch': 2} {'type': 'loss', 'content': 0.39245232939720154, 'timestamp': '2025-09-05 09:06:44.940467', 'step': 2275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:45.109042', 'step': 2275, 'epoch': 2} {'type': 'loss', 'content': 0.30886292457580566, 'timestamp': '2025-09-05 09:06:45.126949', 'step': 2276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:45.324958', 'step': 2276, 'epoch': 2} {'type': 'loss', 'content': 0.3094596564769745, 'timestamp': '2025-09-05 09:06:45.327565', 'step': 2277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:45.532781', 'step': 2277, 'epoch': 2} {'type': 'loss', 'content': 0.3498867452144623, 'timestamp': '2025-09-05 09:06:45.534636', 'step': 2278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:45.741366', 'step': 2278, 'epoch': 2} {'type': 'loss', 'content': 0.4171161651611328, 'timestamp': '2025-09-05 09:06:45.743093', 'step': 2279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:45.912226', 'step': 2279, 'epoch': 2} {'type': 'loss', 'content': 0.3185728192329407, 'timestamp': '2025-09-05 09:06:45.927001', 'step': 2280, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:06:50.639787', 'step': 2280, 'epoch': 2} {'type': 'pplx', 'content': 54.187925265235314, 'timestamp': '2025-09-05 09:06:50.641591', 'step': 2280, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2280', 'timestamp': '2025-09-05 09:06:51.097949', 'step': 2280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:51.263440', 'step': 2280, 'epoch': 2} {'type': 'loss', 'content': 0.31183862686157227, 'timestamp': '2025-09-05 09:06:51.265652', 'step': 2281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:51.433139', 'step': 2281, 'epoch': 2} {'type': 'loss', 'content': 0.39186206459999084, 'timestamp': '2025-09-05 09:06:51.435416', 'step': 2282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:51.642913', 'step': 2282, 'epoch': 2} {'type': 'loss', 'content': 0.30460259318351746, 'timestamp': '2025-09-05 09:06:51.644628', 'step': 2283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:51.811414', 'step': 2283, 'epoch': 2} {'type': 'loss', 'content': 0.3289051055908203, 'timestamp': '2025-09-05 09:06:51.827980', 'step': 2284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:52.021861', 'step': 2284, 'epoch': 2} {'type': 'loss', 'content': 0.3493216335773468, 'timestamp': '2025-09-05 09:06:52.023824', 'step': 2285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:52.188519', 'step': 2285, 'epoch': 2} {'type': 'loss', 'content': 0.1778910756111145, 'timestamp': '2025-09-05 09:06:52.190599', 'step': 2286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:52.388190', 'step': 2286, 'epoch': 2} {'type': 'loss', 'content': 0.22400565445423126, 'timestamp': '2025-09-05 09:06:52.390250', 'step': 2287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:52.595404', 'step': 2287, 'epoch': 2} {'type': 'loss', 'content': 0.2981906533241272, 'timestamp': '2025-09-05 09:06:52.604955', 'step': 2288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:52.767428', 'step': 2288, 'epoch': 2} {'type': 'loss', 'content': 0.26950085163116455, 'timestamp': '2025-09-05 09:06:52.769577', 'step': 2289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:06:52.935744', 'step': 2289, 'epoch': 2} {'type': 'loss', 'content': 0.29919737577438354, 'timestamp': '2025-09-05 09:06:52.937652', 'step': 2290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:53.134942', 'step': 2290, 'epoch': 2} {'type': 'loss', 'content': 0.35156911611557007, 'timestamp': '2025-09-05 09:06:53.136642', 'step': 2291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:53.303716', 'step': 2291, 'epoch': 2} {'type': 'loss', 'content': 0.376793771982193, 'timestamp': '2025-09-05 09:06:53.319226', 'step': 2292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:53.514148', 'step': 2292, 'epoch': 2} {'type': 'loss', 'content': 0.38039150834083557, 'timestamp': '2025-09-05 09:06:53.515929', 'step': 2293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:53.682172', 'step': 2293, 'epoch': 2} {'type': 'loss', 'content': 0.29825207591056824, 'timestamp': '2025-09-05 09:06:53.684304', 'step': 2294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:53.880490', 'step': 2294, 'epoch': 2} {'type': 'loss', 'content': 0.31032633781433105, 'timestamp': '2025-09-05 09:06:53.882195', 'step': 2295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:06:54.047414', 'step': 2295, 'epoch': 2} {'type': 'loss', 'content': 0.2269105762243271, 'timestamp': '2025-09-05 09:06:54.064076', 'step': 2296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:54.308280', 'step': 2296, 'epoch': 2} {'type': 'loss', 'content': 0.2711460590362549, 'timestamp': '2025-09-05 09:06:54.310952', 'step': 2297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:06:54.522080', 'step': 2297, 'epoch': 2} {'type': 'loss', 'content': 0.32219910621643066, 'timestamp': '2025-09-05 09:06:54.523953', 'step': 2298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:54.730976', 'step': 2298, 'epoch': 2} {'type': 'loss', 'content': 0.3553353548049927, 'timestamp': '2025-09-05 09:06:54.732882', 'step': 2299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:06:54.899746', 'step': 2299, 'epoch': 2} {'type': 'loss', 'content': 0.3341793119907379, 'timestamp': '2025-09-05 09:06:54.915155', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:06:59.573196', 'step': 2300, 'epoch': 2} {'type': 'pplx', 'content': 53.83812948418028, 'timestamp': '2025-09-05 09:06:59.576460', 'step': 2300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:06:59.738712', 'step': 2300, 'epoch': 2} {'type': 'loss', 'content': 0.37382030487060547, 'timestamp': '2025-09-05 09:06:59.740310', 'step': 2301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:06:59.907105', 'step': 2301, 'epoch': 2} {'type': 'loss', 'content': 0.3155753016471863, 'timestamp': '2025-09-05 09:06:59.908756', 'step': 2302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:00.113462', 'step': 2302, 'epoch': 2} {'type': 'loss', 'content': 0.3371022045612335, 'timestamp': '2025-09-05 09:07:00.115231', 'step': 2303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:00.310344', 'step': 2303, 'epoch': 2} {'type': 'loss', 'content': 0.3813401162624359, 'timestamp': '2025-09-05 09:07:00.324859', 'step': 2304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:00.512247', 'step': 2304, 'epoch': 2} {'type': 'loss', 'content': 0.3163757920265198, 'timestamp': '2025-09-05 09:07:00.513868', 'step': 2305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:00.709592', 'step': 2305, 'epoch': 2} {'type': 'loss', 'content': 0.4015030264854431, 'timestamp': '2025-09-05 09:07:00.713421', 'step': 2306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:00.911204', 'step': 2306, 'epoch': 2} {'type': 'loss', 'content': 0.241668239235878, 'timestamp': '2025-09-05 09:07:00.912870', 'step': 2307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:01.112322', 'step': 2307, 'epoch': 2} {'type': 'loss', 'content': 0.3123008906841278, 'timestamp': '2025-09-05 09:07:01.122844', 'step': 2308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:01.284697', 'step': 2308, 'epoch': 2} {'type': 'loss', 'content': 0.22155259549617767, 'timestamp': '2025-09-05 09:07:01.286352', 'step': 2309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:01.490379', 'step': 2309, 'epoch': 2} {'type': 'loss', 'content': 0.2877575755119324, 'timestamp': '2025-09-05 09:07:01.492069', 'step': 2310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:01.696638', 'step': 2310, 'epoch': 2} {'type': 'loss', 'content': 0.28378915786743164, 'timestamp': '2025-09-05 09:07:01.698262', 'step': 2311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:01.904919', 'step': 2311, 'epoch': 2} {'type': 'loss', 'content': 0.305711030960083, 'timestamp': '2025-09-05 09:07:01.914270', 'step': 2312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:02.076917', 'step': 2312, 'epoch': 2} {'type': 'loss', 'content': 0.3109692633152008, 'timestamp': '2025-09-05 09:07:02.079113', 'step': 2313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:02.283526', 'step': 2313, 'epoch': 2} {'type': 'loss', 'content': 0.24978035688400269, 'timestamp': '2025-09-05 09:07:02.285251', 'step': 2314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:02.482566', 'step': 2314, 'epoch': 2} {'type': 'loss', 'content': 0.35233667492866516, 'timestamp': '2025-09-05 09:07:02.484199', 'step': 2315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:02.679086', 'step': 2315, 'epoch': 2} {'type': 'loss', 'content': 0.24674127995967865, 'timestamp': '2025-09-05 09:07:02.696266', 'step': 2316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:02.892813', 'step': 2316, 'epoch': 2} {'type': 'loss', 'content': 0.3011695444583893, 'timestamp': '2025-09-05 09:07:02.894598', 'step': 2317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:03.063154', 'step': 2317, 'epoch': 2} {'type': 'loss', 'content': 0.30131420493125916, 'timestamp': '2025-09-05 09:07:03.065608', 'step': 2318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:03.271933', 'step': 2318, 'epoch': 2} {'type': 'loss', 'content': 0.3193568289279938, 'timestamp': '2025-09-05 09:07:03.273728', 'step': 2319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:03.479048', 'step': 2319, 'epoch': 2} {'type': 'loss', 'content': 0.2760051488876343, 'timestamp': '2025-09-05 09:07:03.493537', 'step': 2320, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:07:08.176711', 'step': 2320, 'epoch': 2} {'type': 'pplx', 'content': 53.84229306421487, 'timestamp': '2025-09-05 09:07:08.178871', 'step': 2320, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2320', 'timestamp': '2025-09-05 09:07:08.636200', 'step': 2320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:08.804432', 'step': 2320, 'epoch': 2} {'type': 'loss', 'content': 0.3730832636356354, 'timestamp': '2025-09-05 09:07:08.806721', 'step': 2321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:09.011245', 'step': 2321, 'epoch': 2} {'type': 'loss', 'content': 0.47483229637145996, 'timestamp': '2025-09-05 09:07:09.013084', 'step': 2322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:09.218081', 'step': 2322, 'epoch': 2} {'type': 'loss', 'content': 0.26618441939353943, 'timestamp': '2025-09-05 09:07:09.219997', 'step': 2323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:09.416051', 'step': 2323, 'epoch': 2} {'type': 'loss', 'content': 0.262174516916275, 'timestamp': '2025-09-05 09:07:09.425547', 'step': 2324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:09.587653', 'step': 2324, 'epoch': 2} {'type': 'loss', 'content': 0.43635013699531555, 'timestamp': '2025-09-05 09:07:09.589717', 'step': 2325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:09.796156', 'step': 2325, 'epoch': 2} {'type': 'loss', 'content': 0.2052556276321411, 'timestamp': '2025-09-05 09:07:09.797976', 'step': 2326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:09.967138', 'step': 2326, 'epoch': 2} {'type': 'loss', 'content': 0.3712131083011627, 'timestamp': '2025-09-05 09:07:09.969415', 'step': 2327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:10.164137', 'step': 2327, 'epoch': 2} {'type': 'loss', 'content': 0.35729169845581055, 'timestamp': '2025-09-05 09:07:10.178407', 'step': 2328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:10.368304', 'step': 2328, 'epoch': 2} {'type': 'loss', 'content': 0.39892011880874634, 'timestamp': '2025-09-05 09:07:10.370520', 'step': 2329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:10.567236', 'step': 2329, 'epoch': 2} {'type': 'loss', 'content': 0.36499282717704773, 'timestamp': '2025-09-05 09:07:10.569442', 'step': 2330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:10.780649', 'step': 2330, 'epoch': 2} {'type': 'loss', 'content': 0.23433853685855865, 'timestamp': '2025-09-05 09:07:10.782762', 'step': 2331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:10.979945', 'step': 2331, 'epoch': 2} {'type': 'loss', 'content': 0.24312803149223328, 'timestamp': '2025-09-05 09:07:10.994537', 'step': 2332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:11.183633', 'step': 2332, 'epoch': 2} {'type': 'loss', 'content': 0.26388394832611084, 'timestamp': '2025-09-05 09:07:11.185595', 'step': 2333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:11.393159', 'step': 2333, 'epoch': 2} {'type': 'loss', 'content': 0.4343104660511017, 'timestamp': '2025-09-05 09:07:11.395175', 'step': 2334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:07:11.593175', 'step': 2334, 'epoch': 2} {'type': 'loss', 'content': 0.39986181259155273, 'timestamp': '2025-09-05 09:07:11.594881', 'step': 2335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:11.793473', 'step': 2335, 'epoch': 2} {'type': 'loss', 'content': 0.48972994089126587, 'timestamp': '2025-09-05 09:07:11.802839', 'step': 2336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:11.967517', 'step': 2336, 'epoch': 2} {'type': 'loss', 'content': 0.2836100459098816, 'timestamp': '2025-09-05 09:07:11.969198', 'step': 2337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:12.174955', 'step': 2337, 'epoch': 2} {'type': 'loss', 'content': 0.2238231599330902, 'timestamp': '2025-09-05 09:07:12.177197', 'step': 2338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:07:12.374452', 'step': 2338, 'epoch': 2} {'type': 'loss', 'content': 0.3275935649871826, 'timestamp': '2025-09-05 09:07:12.377033', 'step': 2339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:12.573386', 'step': 2339, 'epoch': 2} {'type': 'loss', 'content': 0.31032243371009827, 'timestamp': '2025-09-05 09:07:12.587393', 'step': 2340, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:07:17.348659', 'step': 2340, 'epoch': 2} {'type': 'pplx', 'content': 54.3609793507131, 'timestamp': '2025-09-05 09:07:17.350682', 'step': 2340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:17.511929', 'step': 2340, 'epoch': 2} {'type': 'loss', 'content': 0.37032943964004517, 'timestamp': '2025-09-05 09:07:17.514323', 'step': 2341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:17.721157', 'step': 2341, 'epoch': 2} {'type': 'loss', 'content': 0.3330005407333374, 'timestamp': '2025-09-05 09:07:17.723262', 'step': 2342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:17.919750', 'step': 2342, 'epoch': 2} {'type': 'loss', 'content': 0.26726004481315613, 'timestamp': '2025-09-05 09:07:17.922084', 'step': 2343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:18.129709', 'step': 2343, 'epoch': 2} {'type': 'loss', 'content': 0.2254151552915573, 'timestamp': '2025-09-05 09:07:18.139944', 'step': 2344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:18.302073', 'step': 2344, 'epoch': 2} {'type': 'loss', 'content': 0.2709170877933502, 'timestamp': '2025-09-05 09:07:18.306175', 'step': 2345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:18.520095', 'step': 2345, 'epoch': 2} {'type': 'loss', 'content': 0.36739596724510193, 'timestamp': '2025-09-05 09:07:18.526131', 'step': 2346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:18.693682', 'step': 2346, 'epoch': 2} {'type': 'loss', 'content': 0.24793685972690582, 'timestamp': '2025-09-05 09:07:18.699367', 'step': 2347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:18.908254', 'step': 2347, 'epoch': 2} {'type': 'loss', 'content': 0.25923240184783936, 'timestamp': '2025-09-05 09:07:18.917770', 'step': 2348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:19.083149', 'step': 2348, 'epoch': 2} {'type': 'loss', 'content': 0.4075714349746704, 'timestamp': '2025-09-05 09:07:19.086080', 'step': 2349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:19.253820', 'step': 2349, 'epoch': 2} {'type': 'loss', 'content': 0.22864657640457153, 'timestamp': '2025-09-05 09:07:19.269288', 'step': 2350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:19.517188', 'step': 2350, 'epoch': 2} {'type': 'loss', 'content': 0.2772950828075409, 'timestamp': '2025-09-05 09:07:19.520982', 'step': 2351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:19.693977', 'step': 2351, 'epoch': 2} {'type': 'loss', 'content': 0.237502783536911, 'timestamp': '2025-09-05 09:07:19.710552', 'step': 2352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:19.910056', 'step': 2352, 'epoch': 2} {'type': 'loss', 'content': 0.3322488069534302, 'timestamp': '2025-09-05 09:07:19.917125', 'step': 2353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:20.116870', 'step': 2353, 'epoch': 2} {'type': 'loss', 'content': 0.39339199662208557, 'timestamp': '2025-09-05 09:07:20.119561', 'step': 2354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:20.327031', 'step': 2354, 'epoch': 2} {'type': 'loss', 'content': 0.31205451488494873, 'timestamp': '2025-09-05 09:07:20.331019', 'step': 2355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:20.540792', 'step': 2355, 'epoch': 2} {'type': 'loss', 'content': 0.19401071965694427, 'timestamp': '2025-09-05 09:07:20.550029', 'step': 2356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:20.714882', 'step': 2356, 'epoch': 2} {'type': 'loss', 'content': 0.27951422333717346, 'timestamp': '2025-09-05 09:07:20.716627', 'step': 2357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:20.924647', 'step': 2357, 'epoch': 2} {'type': 'loss', 'content': 0.3359452486038208, 'timestamp': '2025-09-05 09:07:20.926530', 'step': 2358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:21.094918', 'step': 2358, 'epoch': 2} {'type': 'loss', 'content': 0.32733088731765747, 'timestamp': '2025-09-05 09:07:21.096900', 'step': 2359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:21.294512', 'step': 2359, 'epoch': 2} {'type': 'loss', 'content': 0.27296555042266846, 'timestamp': '2025-09-05 09:07:21.304367', 'step': 2360, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:07:26.011972', 'step': 2360, 'epoch': 2} {'type': 'pplx', 'content': 55.69164641376557, 'timestamp': '2025-09-05 09:07:26.014599', 'step': 2360, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2360', 'timestamp': '2025-09-05 09:07:26.471307', 'step': 2360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:07:26.639440', 'step': 2360, 'epoch': 2} {'type': 'loss', 'content': 0.3209402859210968, 'timestamp': '2025-09-05 09:07:26.641714', 'step': 2361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:26.845836', 'step': 2361, 'epoch': 2} {'type': 'loss', 'content': 0.2272927612066269, 'timestamp': '2025-09-05 09:07:26.848505', 'step': 2362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:27.045204', 'step': 2362, 'epoch': 2} {'type': 'loss', 'content': 0.3490237295627594, 'timestamp': '2025-09-05 09:07:27.047647', 'step': 2363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:27.253023', 'step': 2363, 'epoch': 2} {'type': 'loss', 'content': 0.3624802231788635, 'timestamp': '2025-09-05 09:07:27.267156', 'step': 2364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:27.457605', 'step': 2364, 'epoch': 2} {'type': 'loss', 'content': 0.36499476432800293, 'timestamp': '2025-09-05 09:07:27.459374', 'step': 2365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:27.655960', 'step': 2365, 'epoch': 2} {'type': 'loss', 'content': 0.28135761618614197, 'timestamp': '2025-09-05 09:07:27.658541', 'step': 2366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:27.858711', 'step': 2366, 'epoch': 2} {'type': 'loss', 'content': 0.345076322555542, 'timestamp': '2025-09-05 09:07:27.860970', 'step': 2367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:28.059087', 'step': 2367, 'epoch': 2} {'type': 'loss', 'content': 0.41919395327568054, 'timestamp': '2025-09-05 09:07:28.068579', 'step': 2368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:07:28.231677', 'step': 2368, 'epoch': 2} {'type': 'loss', 'content': 0.2436763346195221, 'timestamp': '2025-09-05 09:07:28.233668', 'step': 2369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:28.399911', 'step': 2369, 'epoch': 2} {'type': 'loss', 'content': 0.23662683367729187, 'timestamp': '2025-09-05 09:07:28.402152', 'step': 2370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:28.597574', 'step': 2370, 'epoch': 2} {'type': 'loss', 'content': 0.22401820123195648, 'timestamp': '2025-09-05 09:07:28.599367', 'step': 2371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:07:28.765558', 'step': 2371, 'epoch': 2} {'type': 'loss', 'content': 0.23337076604366302, 'timestamp': '2025-09-05 09:07:28.782887', 'step': 2372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:28.977131', 'step': 2372, 'epoch': 2} {'type': 'loss', 'content': 0.3868841230869293, 'timestamp': '2025-09-05 09:07:28.979493', 'step': 2373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:29.176816', 'step': 2373, 'epoch': 2} {'type': 'loss', 'content': 0.47699031233787537, 'timestamp': '2025-09-05 09:07:29.184646', 'step': 2374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:29.396413', 'step': 2374, 'epoch': 2} {'type': 'loss', 'content': 0.437484472990036, 'timestamp': '2025-09-05 09:07:29.398070', 'step': 2375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:29.603081', 'step': 2375, 'epoch': 2} {'type': 'loss', 'content': 0.22171063721179962, 'timestamp': '2025-09-05 09:07:29.611960', 'step': 2376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:29.772947', 'step': 2376, 'epoch': 2} {'type': 'loss', 'content': 0.2754736840724945, 'timestamp': '2025-09-05 09:07:29.774820', 'step': 2377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:29.976846', 'step': 2377, 'epoch': 2} {'type': 'loss', 'content': 0.2224128544330597, 'timestamp': '2025-09-05 09:07:29.978350', 'step': 2378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:30.142774', 'step': 2378, 'epoch': 2} {'type': 'loss', 'content': 0.26598674058914185, 'timestamp': '2025-09-05 09:07:30.144652', 'step': 2379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:30.338071', 'step': 2379, 'epoch': 2} {'type': 'loss', 'content': 0.31558096408843994, 'timestamp': '2025-09-05 09:07:30.347353', 'step': 2380, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:07:35.003062', 'step': 2380, 'epoch': 2} {'type': 'pplx', 'content': 54.91370123427139, 'timestamp': '2025-09-05 09:07:35.005288', 'step': 2380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:35.166638', 'step': 2380, 'epoch': 2} {'type': 'loss', 'content': 0.3278944790363312, 'timestamp': '2025-09-05 09:07:35.168373', 'step': 2381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:35.334452', 'step': 2381, 'epoch': 2} {'type': 'loss', 'content': 0.3350487947463989, 'timestamp': '2025-09-05 09:07:35.336389', 'step': 2382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:35.502370', 'step': 2382, 'epoch': 2} {'type': 'loss', 'content': 0.2530112862586975, 'timestamp': '2025-09-05 09:07:35.504209', 'step': 2383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:35.671250', 'step': 2383, 'epoch': 2} {'type': 'loss', 'content': 0.34709376096725464, 'timestamp': '2025-09-05 09:07:35.680926', 'step': 2384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:35.845859', 'step': 2384, 'epoch': 2} {'type': 'loss', 'content': 0.2088186889886856, 'timestamp': '2025-09-05 09:07:35.847816', 'step': 2385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:36.013801', 'step': 2385, 'epoch': 2} {'type': 'loss', 'content': 0.2869564890861511, 'timestamp': '2025-09-05 09:07:36.015947', 'step': 2386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:36.181958', 'step': 2386, 'epoch': 2} {'type': 'loss', 'content': 0.3361554741859436, 'timestamp': '2025-09-05 09:07:36.184163', 'step': 2387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:36.352174', 'step': 2387, 'epoch': 2} {'type': 'loss', 'content': 0.35683155059814453, 'timestamp': '2025-09-05 09:07:36.361191', 'step': 2388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:36.525010', 'step': 2388, 'epoch': 2} {'type': 'loss', 'content': 0.4390067756175995, 'timestamp': '2025-09-05 09:07:36.526684', 'step': 2389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:07:36.691073', 'step': 2389, 'epoch': 2} {'type': 'loss', 'content': 0.3709789514541626, 'timestamp': '2025-09-05 09:07:36.693111', 'step': 2390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:36.862890', 'step': 2390, 'epoch': 2} {'type': 'loss', 'content': 0.2603262960910797, 'timestamp': '2025-09-05 09:07:36.865311', 'step': 2391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:37.031735', 'step': 2391, 'epoch': 2} {'type': 'loss', 'content': 0.23307450115680695, 'timestamp': '2025-09-05 09:07:37.041183', 'step': 2392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:37.204920', 'step': 2392, 'epoch': 2} {'type': 'loss', 'content': 0.27466168999671936, 'timestamp': '2025-09-05 09:07:37.206775', 'step': 2393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:07:37.373885', 'step': 2393, 'epoch': 2} {'type': 'loss', 'content': 0.2780590355396271, 'timestamp': '2025-09-05 09:07:37.375972', 'step': 2394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:37.542987', 'step': 2394, 'epoch': 2} {'type': 'loss', 'content': 0.17993779480457306, 'timestamp': '2025-09-05 09:07:37.545373', 'step': 2395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:37.711127', 'step': 2395, 'epoch': 2} {'type': 'loss', 'content': 0.33172348141670227, 'timestamp': '2025-09-05 09:07:37.777082', 'step': 2396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:37.982581', 'step': 2396, 'epoch': 2} {'type': 'loss', 'content': 0.23976249992847443, 'timestamp': '2025-09-05 09:07:37.985053', 'step': 2397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:38.166172', 'step': 2397, 'epoch': 2} {'type': 'loss', 'content': 0.3238065540790558, 'timestamp': '2025-09-05 09:07:38.168100', 'step': 2398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:38.327669', 'step': 2398, 'epoch': 2} {'type': 'loss', 'content': 0.2310824692249298, 'timestamp': '2025-09-05 09:07:38.329756', 'step': 2399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:38.488943', 'step': 2399, 'epoch': 2} {'type': 'loss', 'content': 0.13710328936576843, 'timestamp': '2025-09-05 09:07:38.503058', 'step': 2400, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:07:43.150193', 'step': 2400, 'epoch': 2} {'type': 'pplx', 'content': 53.65770720978935, 'timestamp': '2025-09-05 09:07:43.152234', 'step': 2400, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2400', 'timestamp': '2025-09-05 09:07:43.622167', 'step': 2400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:43.756497', 'step': 2400, 'epoch': 2} {'type': 'loss', 'content': 0.3784136474132538, 'timestamp': '2025-09-05 09:07:43.758846', 'step': 2401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:43.928455', 'step': 2401, 'epoch': 2} {'type': 'loss', 'content': 0.28840699791908264, 'timestamp': '2025-09-05 09:07:43.930366', 'step': 2402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:44.100558', 'step': 2402, 'epoch': 2} {'type': 'loss', 'content': 0.12435296177864075, 'timestamp': '2025-09-05 09:07:44.103005', 'step': 2403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:44.339719', 'step': 2403, 'epoch': 2} {'type': 'loss', 'content': 0.19681872427463531, 'timestamp': '2025-09-05 09:07:44.355064', 'step': 2404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:44.509599', 'step': 2404, 'epoch': 2} {'type': 'loss', 'content': 0.29926079511642456, 'timestamp': '2025-09-05 09:07:44.511796', 'step': 2405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:44.682182', 'step': 2405, 'epoch': 2} {'type': 'loss', 'content': 0.39890509843826294, 'timestamp': '2025-09-05 09:07:44.684493', 'step': 2406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:44.855392', 'step': 2406, 'epoch': 2} {'type': 'loss', 'content': 0.34845206141471863, 'timestamp': '2025-09-05 09:07:44.857439', 'step': 2407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 5440033091648.0}, 'timestamp': '2025-09-05 09:07:45.021834', 'step': 2407, 'epoch': 2} {'type': 'loss', 'content': 0.4749622046947479, 'timestamp': '2025-09-05 09:07:45.036535', 'step': 2408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:45.289008', 'step': 2408, 'epoch': 2} {'type': 'loss', 'content': 0.17726445198059082, 'timestamp': '2025-09-05 09:07:45.314028', 'step': 2409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:45.497454', 'step': 2409, 'epoch': 2} {'type': 'loss', 'content': 0.3263298571109772, 'timestamp': '2025-09-05 09:07:45.514542', 'step': 2410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:45.745066', 'step': 2410, 'epoch': 2} {'type': 'loss', 'content': 0.2775588929653168, 'timestamp': '2025-09-05 09:07:45.747377', 'step': 2411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:45.919583', 'step': 2411, 'epoch': 2} {'type': 'loss', 'content': 0.3065100610256195, 'timestamp': '2025-09-05 09:07:45.975180', 'step': 2412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:46.147292', 'step': 2412, 'epoch': 2} {'type': 'loss', 'content': 0.28606003522872925, 'timestamp': '2025-09-05 09:07:46.149413', 'step': 2413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:46.320168', 'step': 2413, 'epoch': 2} {'type': 'loss', 'content': 0.2223246544599533, 'timestamp': '2025-09-05 09:07:46.322268', 'step': 2414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:46.481163', 'step': 2414, 'epoch': 2} {'type': 'loss', 'content': 0.4483068585395813, 'timestamp': '2025-09-05 09:07:46.483191', 'step': 2415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:46.678206', 'step': 2415, 'epoch': 2} {'type': 'loss', 'content': 0.31489551067352295, 'timestamp': '2025-09-05 09:07:46.692575', 'step': 2416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:07:46.844050', 'step': 2416, 'epoch': 2} {'type': 'loss', 'content': 0.2865988612174988, 'timestamp': '2025-09-05 09:07:46.846269', 'step': 2417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:47.012544', 'step': 2417, 'epoch': 2} {'type': 'loss', 'content': 0.3174918293952942, 'timestamp': '2025-09-05 09:07:47.014872', 'step': 2418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:47.186121', 'step': 2418, 'epoch': 2} {'type': 'loss', 'content': 0.26380813121795654, 'timestamp': '2025-09-05 09:07:47.188686', 'step': 2419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:47.360860', 'step': 2419, 'epoch': 2} {'type': 'loss', 'content': 0.25339600443840027, 'timestamp': '2025-09-05 09:07:47.375096', 'step': 2420, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:07:52.646084', 'step': 2420, 'epoch': 2} {'type': 'pplx', 'content': 54.45162833560751, 'timestamp': '2025-09-05 09:07:52.648300', 'step': 2420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:07:52.779251', 'step': 2420, 'epoch': 2} {'type': 'loss', 'content': 0.21093280613422394, 'timestamp': '2025-09-05 09:07:52.781980', 'step': 2421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:52.938785', 'step': 2421, 'epoch': 2} {'type': 'loss', 'content': 0.24759776890277863, 'timestamp': '2025-09-05 09:07:53.020176', 'step': 2422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:53.213342', 'step': 2422, 'epoch': 2} {'type': 'loss', 'content': 0.19092871248722076, 'timestamp': '2025-09-05 09:07:53.215194', 'step': 2423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:53.382864', 'step': 2423, 'epoch': 2} {'type': 'loss', 'content': 0.38986799120903015, 'timestamp': '2025-09-05 09:07:53.396260', 'step': 2424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:53.548122', 'step': 2424, 'epoch': 2} {'type': 'loss', 'content': 0.25403735041618347, 'timestamp': '2025-09-05 09:07:53.550389', 'step': 2425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:53.707715', 'step': 2425, 'epoch': 2} {'type': 'loss', 'content': 0.39463475346565247, 'timestamp': '2025-09-05 09:07:53.736535', 'step': 2426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:07:53.949730', 'step': 2426, 'epoch': 2} {'type': 'loss', 'content': 0.27436649799346924, 'timestamp': '2025-09-05 09:07:53.982038', 'step': 2427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:54.210833', 'step': 2427, 'epoch': 2} {'type': 'loss', 'content': 0.3307807147502899, 'timestamp': '2025-09-05 09:07:54.225106', 'step': 2428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:54.380225', 'step': 2428, 'epoch': 2} {'type': 'loss', 'content': 0.5084760785102844, 'timestamp': '2025-09-05 09:07:54.382463', 'step': 2429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:54.552231', 'step': 2429, 'epoch': 2} {'type': 'loss', 'content': 0.2736986577510834, 'timestamp': '2025-09-05 09:07:54.554064', 'step': 2430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:54.725829', 'step': 2430, 'epoch': 2} {'type': 'loss', 'content': 0.2925279140472412, 'timestamp': '2025-09-05 09:07:54.728433', 'step': 2431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:07:54.886581', 'step': 2431, 'epoch': 2} {'type': 'loss', 'content': 0.27017101645469666, 'timestamp': '2025-09-05 09:07:54.903369', 'step': 2432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:55.063357', 'step': 2432, 'epoch': 2} {'type': 'loss', 'content': 0.2796265780925751, 'timestamp': '2025-09-05 09:07:55.066444', 'step': 2433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:55.224425', 'step': 2433, 'epoch': 2} {'type': 'loss', 'content': 0.35782158374786377, 'timestamp': '2025-09-05 09:07:55.227028', 'step': 2434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:07:55.364722', 'step': 2434, 'epoch': 2} {'type': 'loss', 'content': 0.3305521607398987, 'timestamp': '2025-09-05 09:07:55.366733', 'step': 2435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:55.538296', 'step': 2435, 'epoch': 2} {'type': 'loss', 'content': 0.3081381618976593, 'timestamp': '2025-09-05 09:07:55.555184', 'step': 2436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:55.716234', 'step': 2436, 'epoch': 2} {'type': 'loss', 'content': 0.3570384681224823, 'timestamp': '2025-09-05 09:07:55.718614', 'step': 2437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:07:55.888053', 'step': 2437, 'epoch': 2} {'type': 'loss', 'content': 0.15754249691963196, 'timestamp': '2025-09-05 09:07:55.890184', 'step': 2438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:07:56.056501', 'step': 2438, 'epoch': 2} {'type': 'loss', 'content': 0.20244644582271576, 'timestamp': '2025-09-05 09:07:56.058763', 'step': 2439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:07:56.216598', 'step': 2439, 'epoch': 2} {'type': 'loss', 'content': 0.2593131959438324, 'timestamp': '2025-09-05 09:07:56.230745', 'step': 2440, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:08:01.348684', 'step': 2440, 'epoch': 2} {'type': 'pplx', 'content': 54.726233382312806, 'timestamp': '2025-09-05 09:08:01.352558', 'step': 2440, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2440', 'timestamp': '2025-09-05 09:08:01.916075', 'step': 2440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:02.075987', 'step': 2440, 'epoch': 2} {'type': 'loss', 'content': 0.22823497653007507, 'timestamp': '2025-09-05 09:08:02.078058', 'step': 2441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:02.251582', 'step': 2441, 'epoch': 2} {'type': 'loss', 'content': 0.13066086173057556, 'timestamp': '2025-09-05 09:08:02.254006', 'step': 2442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:02.418800', 'step': 2442, 'epoch': 2} {'type': 'loss', 'content': 0.3410320580005646, 'timestamp': '2025-09-05 09:08:02.420955', 'step': 2443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:02.586353', 'step': 2443, 'epoch': 2} {'type': 'loss', 'content': 0.3090682327747345, 'timestamp': '2025-09-05 09:08:02.600282', 'step': 2444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:02.757752', 'step': 2444, 'epoch': 2} {'type': 'loss', 'content': 0.3452575206756592, 'timestamp': '2025-09-05 09:08:02.760185', 'step': 2445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:02.927518', 'step': 2445, 'epoch': 2} {'type': 'loss', 'content': 0.31340721249580383, 'timestamp': '2025-09-05 09:08:02.929857', 'step': 2446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:03.190905', 'step': 2446, 'epoch': 2} {'type': 'loss', 'content': 0.3519221544265747, 'timestamp': '2025-09-05 09:08:03.192880', 'step': 2447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:08:03.367264', 'step': 2447, 'epoch': 2} {'type': 'loss', 'content': 0.5222123265266418, 'timestamp': '2025-09-05 09:08:03.381834', 'step': 2448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:03.540496', 'step': 2448, 'epoch': 2} {'type': 'loss', 'content': 0.25161540508270264, 'timestamp': '2025-09-05 09:08:03.542795', 'step': 2449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:03.709099', 'step': 2449, 'epoch': 2} {'type': 'loss', 'content': 0.28207287192344666, 'timestamp': '2025-09-05 09:08:03.752015', 'step': 2450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:04.036556', 'step': 2450, 'epoch': 2} {'type': 'loss', 'content': 0.4279467761516571, 'timestamp': '2025-09-05 09:08:04.039030', 'step': 2451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:04.354549', 'step': 2451, 'epoch': 2} {'type': 'loss', 'content': 0.32319822907447815, 'timestamp': '2025-09-05 09:08:04.369029', 'step': 2452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:04.538604', 'step': 2452, 'epoch': 2} {'type': 'loss', 'content': 0.27443209290504456, 'timestamp': '2025-09-05 09:08:04.598961', 'step': 2453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:04.818752', 'step': 2453, 'epoch': 2} {'type': 'loss', 'content': 0.2456836998462677, 'timestamp': '2025-09-05 09:08:04.821148', 'step': 2454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:04.988344', 'step': 2454, 'epoch': 2} {'type': 'loss', 'content': 0.35845696926116943, 'timestamp': '2025-09-05 09:08:04.990202', 'step': 2455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:05.154815', 'step': 2455, 'epoch': 2} {'type': 'loss', 'content': 0.40192878246307373, 'timestamp': '2025-09-05 09:08:05.171087', 'step': 2456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:05.339221', 'step': 2456, 'epoch': 2} {'type': 'loss', 'content': 0.38854965567588806, 'timestamp': '2025-09-05 09:08:05.342153', 'step': 2457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:05.506686', 'step': 2457, 'epoch': 2} {'type': 'loss', 'content': 0.36593395471572876, 'timestamp': '2025-09-05 09:08:05.508864', 'step': 2458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:05.673878', 'step': 2458, 'epoch': 2} {'type': 'loss', 'content': 0.3558920621871948, 'timestamp': '2025-09-05 09:08:05.676132', 'step': 2459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:08:05.841507', 'step': 2459, 'epoch': 2} {'type': 'loss', 'content': 0.23418471217155457, 'timestamp': '2025-09-05 09:08:05.858314', 'step': 2460, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:08:10.996179', 'step': 2460, 'epoch': 2} {'type': 'pplx', 'content': 53.48732085430296, 'timestamp': '2025-09-05 09:08:11.039993', 'step': 2460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:11.173071', 'step': 2460, 'epoch': 2} {'type': 'loss', 'content': 0.2769435942173004, 'timestamp': '2025-09-05 09:08:11.189561', 'step': 2461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:11.452212', 'step': 2461, 'epoch': 2} {'type': 'loss', 'content': 0.44493263959884644, 'timestamp': '2025-09-05 09:08:11.454879', 'step': 2462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:11.710370', 'step': 2462, 'epoch': 2} {'type': 'loss', 'content': 0.274783194065094, 'timestamp': '2025-09-05 09:08:11.712867', 'step': 2463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:11.932413', 'step': 2463, 'epoch': 2} {'type': 'loss', 'content': 0.2961232364177704, 'timestamp': '2025-09-05 09:08:11.945952', 'step': 2464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:12.106328', 'step': 2464, 'epoch': 2} {'type': 'loss', 'content': 0.19862420856952667, 'timestamp': '2025-09-05 09:08:12.108278', 'step': 2465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:12.286236', 'step': 2465, 'epoch': 2} {'type': 'loss', 'content': 0.49661245942115784, 'timestamp': '2025-09-05 09:08:12.288268', 'step': 2466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:12.423310', 'step': 2466, 'epoch': 2} {'type': 'loss', 'content': 0.3499586284160614, 'timestamp': '2025-09-05 09:08:12.425494', 'step': 2467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:12.590694', 'step': 2467, 'epoch': 2} {'type': 'loss', 'content': 0.3866555392742157, 'timestamp': '2025-09-05 09:08:12.604767', 'step': 2468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:12.762857', 'step': 2468, 'epoch': 2} {'type': 'loss', 'content': 0.15350095927715302, 'timestamp': '2025-09-05 09:08:12.765656', 'step': 2469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:12.930539', 'step': 2469, 'epoch': 2} {'type': 'loss', 'content': 0.2772982716560364, 'timestamp': '2025-09-05 09:08:12.933067', 'step': 2470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:13.150777', 'step': 2470, 'epoch': 2} {'type': 'loss', 'content': 0.2272377461194992, 'timestamp': '2025-09-05 09:08:13.153293', 'step': 2471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:13.323513', 'step': 2471, 'epoch': 2} {'type': 'loss', 'content': 0.3104439973831177, 'timestamp': '2025-09-05 09:08:13.339831', 'step': 2472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:13.506609', 'step': 2472, 'epoch': 2} {'type': 'loss', 'content': 0.3005099892616272, 'timestamp': '2025-09-05 09:08:13.509163', 'step': 2473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:13.728381', 'step': 2473, 'epoch': 2} {'type': 'loss', 'content': 0.32455500960350037, 'timestamp': '2025-09-05 09:08:13.730644', 'step': 2474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:13.971396', 'step': 2474, 'epoch': 2} {'type': 'loss', 'content': 0.3721053898334503, 'timestamp': '2025-09-05 09:08:13.995417', 'step': 2475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:14.176746', 'step': 2475, 'epoch': 2} {'type': 'loss', 'content': 0.33817243576049805, 'timestamp': '2025-09-05 09:08:14.194035', 'step': 2476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:14.364770', 'step': 2476, 'epoch': 2} {'type': 'loss', 'content': 0.4381646513938904, 'timestamp': '2025-09-05 09:08:14.367767', 'step': 2477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:14.586996', 'step': 2477, 'epoch': 2} {'type': 'loss', 'content': 0.45706042647361755, 'timestamp': '2025-09-05 09:08:14.589275', 'step': 2478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:14.811775', 'step': 2478, 'epoch': 2} {'type': 'loss', 'content': 0.3688715994358063, 'timestamp': '2025-09-05 09:08:14.813672', 'step': 2479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:15.011990', 'step': 2479, 'epoch': 2} {'type': 'loss', 'content': 0.365582138299942, 'timestamp': '2025-09-05 09:08:15.025519', 'step': 2480, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:08:20.267144', 'step': 2480, 'epoch': 2} {'type': 'pplx', 'content': 52.49038722586114, 'timestamp': '2025-09-05 09:08:20.269130', 'step': 2480, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2480', 'timestamp': '2025-09-05 09:08:20.748386', 'step': 2480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:20.908304', 'step': 2480, 'epoch': 2} {'type': 'loss', 'content': 0.3301137685775757, 'timestamp': '2025-09-05 09:08:20.911345', 'step': 2481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:21.048283', 'step': 2481, 'epoch': 2} {'type': 'loss', 'content': 0.2704985737800598, 'timestamp': '2025-09-05 09:08:21.051353', 'step': 2482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:21.272855', 'step': 2482, 'epoch': 2} {'type': 'loss', 'content': 0.24071909487247467, 'timestamp': '2025-09-05 09:08:21.316201', 'step': 2483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:08:21.495818', 'step': 2483, 'epoch': 2} {'type': 'loss', 'content': 0.27359169721603394, 'timestamp': '2025-09-05 09:08:21.585302', 'step': 2484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:21.776756', 'step': 2484, 'epoch': 2} {'type': 'loss', 'content': 0.30878910422325134, 'timestamp': '2025-09-05 09:08:21.779385', 'step': 2485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:21.944264', 'step': 2485, 'epoch': 2} {'type': 'loss', 'content': 0.3247639834880829, 'timestamp': '2025-09-05 09:08:21.986129', 'step': 2486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:22.212722', 'step': 2486, 'epoch': 2} {'type': 'loss', 'content': 0.2619643211364746, 'timestamp': '2025-09-05 09:08:22.233914', 'step': 2487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:22.458476', 'step': 2487, 'epoch': 2} {'type': 'loss', 'content': 0.26061657071113586, 'timestamp': '2025-09-05 09:08:22.472954', 'step': 2488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:22.632179', 'step': 2488, 'epoch': 2} {'type': 'loss', 'content': 0.3099912106990814, 'timestamp': '2025-09-05 09:08:22.635510', 'step': 2489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:22.799538', 'step': 2489, 'epoch': 2} {'type': 'loss', 'content': 0.21963852643966675, 'timestamp': '2025-09-05 09:08:22.802163', 'step': 2490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:22.978088', 'step': 2490, 'epoch': 2} {'type': 'loss', 'content': 0.3000223636627197, 'timestamp': '2025-09-05 09:08:22.981287', 'step': 2491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:23.157663', 'step': 2491, 'epoch': 2} {'type': 'loss', 'content': 0.3209587335586548, 'timestamp': '2025-09-05 09:08:23.174124', 'step': 2492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:23.342854', 'step': 2492, 'epoch': 2} {'type': 'loss', 'content': 0.33791443705558777, 'timestamp': '2025-09-05 09:08:23.346174', 'step': 2493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:23.510842', 'step': 2493, 'epoch': 2} {'type': 'loss', 'content': 0.5459884405136108, 'timestamp': '2025-09-05 09:08:23.513096', 'step': 2494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:08:23.679341', 'step': 2494, 'epoch': 2} {'type': 'loss', 'content': 0.2700771987438202, 'timestamp': '2025-09-05 09:08:23.681816', 'step': 2495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:23.847783', 'step': 2495, 'epoch': 2} {'type': 'loss', 'content': 0.30430659651756287, 'timestamp': '2025-09-05 09:08:23.858129', 'step': 2496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:23.992946', 'step': 2496, 'epoch': 2} {'type': 'loss', 'content': 0.26771894097328186, 'timestamp': '2025-09-05 09:08:23.995222', 'step': 2497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:24.257840', 'step': 2497, 'epoch': 2} {'type': 'loss', 'content': 0.32667091488838196, 'timestamp': '2025-09-05 09:08:24.300330', 'step': 2498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:24.522074', 'step': 2498, 'epoch': 2} {'type': 'loss', 'content': 0.29449260234832764, 'timestamp': '2025-09-05 09:08:24.524635', 'step': 2499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:24.689395', 'step': 2499, 'epoch': 2} {'type': 'loss', 'content': 0.2695910930633545, 'timestamp': '2025-09-05 09:08:24.703400', 'step': 2500, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:08:30.131163', 'step': 2500, 'epoch': 2} {'type': 'pplx', 'content': 52.813743962987445, 'timestamp': '2025-09-05 09:08:30.133472', 'step': 2500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:30.266354', 'step': 2500, 'epoch': 2} {'type': 'loss', 'content': 0.33899521827697754, 'timestamp': '2025-09-05 09:08:30.268500', 'step': 2501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:30.406032', 'step': 2501, 'epoch': 2} {'type': 'loss', 'content': 0.4425645172595978, 'timestamp': '2025-09-05 09:08:30.409138', 'step': 2502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:30.730139', 'step': 2502, 'epoch': 2} {'type': 'loss', 'content': 0.3839690387248993, 'timestamp': '2025-09-05 09:08:30.732271', 'step': 2503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:30.944016', 'step': 2503, 'epoch': 2} {'type': 'loss', 'content': 0.28737911581993103, 'timestamp': '2025-09-05 09:08:30.958478', 'step': 2504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:31.147199', 'step': 2504, 'epoch': 2} {'type': 'loss', 'content': 0.5052091479301453, 'timestamp': '2025-09-05 09:08:31.171139', 'step': 2505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:31.513127', 'step': 2505, 'epoch': 2} {'type': 'loss', 'content': 0.34135702252388, 'timestamp': '2025-09-05 09:08:31.515377', 'step': 2506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:31.720097', 'step': 2506, 'epoch': 2} {'type': 'loss', 'content': 0.22179536521434784, 'timestamp': '2025-09-05 09:08:31.722369', 'step': 2507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:31.890591', 'step': 2507, 'epoch': 2} {'type': 'loss', 'content': 0.3495122790336609, 'timestamp': '2025-09-05 09:08:31.907828', 'step': 2508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:32.101530', 'step': 2508, 'epoch': 2} {'type': 'loss', 'content': 0.252922922372818, 'timestamp': '2025-09-05 09:08:32.105574', 'step': 2509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:32.310055', 'step': 2509, 'epoch': 2} {'type': 'loss', 'content': 0.2928535044193268, 'timestamp': '2025-09-05 09:08:32.312376', 'step': 2510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:32.561305', 'step': 2510, 'epoch': 2} {'type': 'loss', 'content': 0.3097461462020874, 'timestamp': '2025-09-05 09:08:32.564710', 'step': 2511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:08:32.772402', 'step': 2511, 'epoch': 2} {'type': 'loss', 'content': 0.3948187232017517, 'timestamp': '2025-09-05 09:08:32.789068', 'step': 2512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:33.031818', 'step': 2512, 'epoch': 2} {'type': 'loss', 'content': 0.2657097578048706, 'timestamp': '2025-09-05 09:08:33.034507', 'step': 2513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:33.241556', 'step': 2513, 'epoch': 2} {'type': 'loss', 'content': 0.3475334346294403, 'timestamp': '2025-09-05 09:08:33.244226', 'step': 2514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:33.451829', 'step': 2514, 'epoch': 2} {'type': 'loss', 'content': 0.2843340039253235, 'timestamp': '2025-09-05 09:08:33.453852', 'step': 2515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:33.658405', 'step': 2515, 'epoch': 2} {'type': 'loss', 'content': 0.4441480338573456, 'timestamp': '2025-09-05 09:08:33.673480', 'step': 2516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:33.862983', 'step': 2516, 'epoch': 2} {'type': 'loss', 'content': 0.24662137031555176, 'timestamp': '2025-09-05 09:08:33.865600', 'step': 2517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:34.080449', 'step': 2517, 'epoch': 2} {'type': 'loss', 'content': 0.40211036801338196, 'timestamp': '2025-09-05 09:08:34.082800', 'step': 2518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:34.371690', 'step': 2518, 'epoch': 2} {'type': 'loss', 'content': 0.4713238477706909, 'timestamp': '2025-09-05 09:08:34.374322', 'step': 2519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:34.571969', 'step': 2519, 'epoch': 2} {'type': 'loss', 'content': 0.3440394401550293, 'timestamp': '2025-09-05 09:08:34.587486', 'step': 2520, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:08:40.157649', 'step': 2520, 'epoch': 2} {'type': 'pplx', 'content': 53.0074163999356, 'timestamp': '2025-09-05 09:08:40.159389', 'step': 2520, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2520', 'timestamp': '2025-09-05 09:08:40.670948', 'step': 2520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:40.870996', 'step': 2520, 'epoch': 2} {'type': 'loss', 'content': 0.2662496566772461, 'timestamp': '2025-09-05 09:08:40.873982', 'step': 2521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:41.068057', 'step': 2521, 'epoch': 2} {'type': 'loss', 'content': 0.2788788974285126, 'timestamp': '2025-09-05 09:08:41.070930', 'step': 2522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:41.236468', 'step': 2522, 'epoch': 2} {'type': 'loss', 'content': 0.2379118651151657, 'timestamp': '2025-09-05 09:08:41.240108', 'step': 2523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:41.445747', 'step': 2523, 'epoch': 2} {'type': 'loss', 'content': 0.2710990309715271, 'timestamp': '2025-09-05 09:08:41.460173', 'step': 2524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:41.647392', 'step': 2524, 'epoch': 2} {'type': 'loss', 'content': 0.35253018140792847, 'timestamp': '2025-09-05 09:08:41.649673', 'step': 2525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:41.899382', 'step': 2525, 'epoch': 2} {'type': 'loss', 'content': 0.33853983879089355, 'timestamp': '2025-09-05 09:08:41.903036', 'step': 2526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:42.152763', 'step': 2526, 'epoch': 2} {'type': 'loss', 'content': 0.2997421324253082, 'timestamp': '2025-09-05 09:08:42.195973', 'step': 2527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:08:42.443160', 'step': 2527, 'epoch': 2} {'type': 'loss', 'content': 0.22982150316238403, 'timestamp': '2025-09-05 09:08:42.460245', 'step': 2528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:42.657316', 'step': 2528, 'epoch': 2} {'type': 'loss', 'content': 0.27433791756629944, 'timestamp': '2025-09-05 09:08:42.661098', 'step': 2529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:42.859379', 'step': 2529, 'epoch': 2} {'type': 'loss', 'content': 0.4138650894165039, 'timestamp': '2025-09-05 09:08:42.861980', 'step': 2530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:43.057651', 'step': 2530, 'epoch': 2} {'type': 'loss', 'content': 0.5147197842597961, 'timestamp': '2025-09-05 09:08:43.061658', 'step': 2531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:43.258384', 'step': 2531, 'epoch': 2} {'type': 'loss', 'content': 0.3644541800022125, 'timestamp': '2025-09-05 09:08:43.277457', 'step': 2532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:43.595788', 'step': 2532, 'epoch': 2} {'type': 'loss', 'content': 0.21938161551952362, 'timestamp': '2025-09-05 09:08:43.598479', 'step': 2533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:43.796759', 'step': 2533, 'epoch': 2} {'type': 'loss', 'content': 0.3870631754398346, 'timestamp': '2025-09-05 09:08:43.799743', 'step': 2534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:43.999027', 'step': 2534, 'epoch': 2} {'type': 'loss', 'content': 0.1749730259180069, 'timestamp': '2025-09-05 09:08:44.002043', 'step': 2535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:44.198000', 'step': 2535, 'epoch': 2} {'type': 'loss', 'content': 0.2818840742111206, 'timestamp': '2025-09-05 09:08:44.212763', 'step': 2536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:44.400117', 'step': 2536, 'epoch': 2} {'type': 'loss', 'content': 0.22080212831497192, 'timestamp': '2025-09-05 09:08:44.403454', 'step': 2537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:44.610140', 'step': 2537, 'epoch': 2} {'type': 'loss', 'content': 0.2120576649904251, 'timestamp': '2025-09-05 09:08:44.613452', 'step': 2538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:44.862843', 'step': 2538, 'epoch': 2} {'type': 'loss', 'content': 0.17697148025035858, 'timestamp': '2025-09-05 09:08:44.864931', 'step': 2539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:45.040233', 'step': 2539, 'epoch': 2} {'type': 'loss', 'content': 0.37181296944618225, 'timestamp': '2025-09-05 09:08:45.055138', 'step': 2540, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:08:50.212502', 'step': 2540, 'epoch': 2} {'type': 'pplx', 'content': 53.064035598436966, 'timestamp': '2025-09-05 09:08:50.214408', 'step': 2540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:50.346568', 'step': 2540, 'epoch': 2} {'type': 'loss', 'content': 0.3222182095050812, 'timestamp': '2025-09-05 09:08:50.348433', 'step': 2541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:50.484589', 'step': 2541, 'epoch': 2} {'type': 'loss', 'content': 0.15369223058223724, 'timestamp': '2025-09-05 09:08:50.486899', 'step': 2542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:50.622707', 'step': 2542, 'epoch': 2} {'type': 'loss', 'content': 0.37195268273353577, 'timestamp': '2025-09-05 09:08:50.624641', 'step': 2543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:50.761152', 'step': 2543, 'epoch': 2} {'type': 'loss', 'content': 0.3791513741016388, 'timestamp': '2025-09-05 09:08:50.769844', 'step': 2544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:50.902437', 'step': 2544, 'epoch': 2} {'type': 'loss', 'content': 0.25493812561035156, 'timestamp': '2025-09-05 09:08:50.904432', 'step': 2545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:51.062401', 'step': 2545, 'epoch': 2} {'type': 'loss', 'content': 0.34883612394332886, 'timestamp': '2025-09-05 09:08:51.064212', 'step': 2546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:08:51.235180', 'step': 2546, 'epoch': 2} {'type': 'loss', 'content': 0.36856377124786377, 'timestamp': '2025-09-05 09:08:51.237348', 'step': 2547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:51.372850', 'step': 2547, 'epoch': 2} {'type': 'loss', 'content': 0.27879372239112854, 'timestamp': '2025-09-05 09:08:51.386856', 'step': 2548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:51.538291', 'step': 2548, 'epoch': 2} {'type': 'loss', 'content': 0.26352420449256897, 'timestamp': '2025-09-05 09:08:51.539972', 'step': 2549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:51.709065', 'step': 2549, 'epoch': 2} {'type': 'loss', 'content': 0.3046532869338989, 'timestamp': '2025-09-05 09:08:51.711027', 'step': 2550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:08:51.872075', 'step': 2550, 'epoch': 2} {'type': 'loss', 'content': 0.45060494542121887, 'timestamp': '2025-09-05 09:08:51.876519', 'step': 2551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:52.036405', 'step': 2551, 'epoch': 2} {'type': 'loss', 'content': 0.4088442027568817, 'timestamp': '2025-09-05 09:08:52.050278', 'step': 2552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:52.201867', 'step': 2552, 'epoch': 2} {'type': 'loss', 'content': 0.2506445646286011, 'timestamp': '2025-09-05 09:08:52.203855', 'step': 2553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:52.361528', 'step': 2553, 'epoch': 2} {'type': 'loss', 'content': 0.2608177363872528, 'timestamp': '2025-09-05 09:08:52.363491', 'step': 2554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:52.521849', 'step': 2554, 'epoch': 2} {'type': 'loss', 'content': 0.35635024309158325, 'timestamp': '2025-09-05 09:08:52.523750', 'step': 2555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:52.681338', 'step': 2555, 'epoch': 2} {'type': 'loss', 'content': 0.3468969166278839, 'timestamp': '2025-09-05 09:08:52.696825', 'step': 2556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:52.855565', 'step': 2556, 'epoch': 2} {'type': 'loss', 'content': 0.421159029006958, 'timestamp': '2025-09-05 09:08:52.857810', 'step': 2557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:08:53.018521', 'step': 2557, 'epoch': 2} {'type': 'loss', 'content': 0.3261510729789734, 'timestamp': '2025-09-05 09:08:53.020341', 'step': 2558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:53.190276', 'step': 2558, 'epoch': 2} {'type': 'loss', 'content': 0.4048421084880829, 'timestamp': '2025-09-05 09:08:53.192324', 'step': 2559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:08:53.362579', 'step': 2559, 'epoch': 2} {'type': 'loss', 'content': 0.41077789664268494, 'timestamp': '2025-09-05 09:08:53.378509', 'step': 2560, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:08:58.019506', 'step': 2560, 'epoch': 2} {'type': 'pplx', 'content': 54.037454830465165, 'timestamp': '2025-09-05 09:08:58.021631', 'step': 2560, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2560', 'timestamp': '2025-09-05 09:08:58.543085', 'step': 2560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:58.681373', 'step': 2560, 'epoch': 2} {'type': 'loss', 'content': 0.25367656350135803, 'timestamp': '2025-09-05 09:08:58.683466', 'step': 2561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:58.839482', 'step': 2561, 'epoch': 2} {'type': 'loss', 'content': 0.2890976667404175, 'timestamp': '2025-09-05 09:08:58.841709', 'step': 2562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:58.999471', 'step': 2562, 'epoch': 2} {'type': 'loss', 'content': 0.3569534718990326, 'timestamp': '2025-09-05 09:08:59.001771', 'step': 2563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:08:59.170317', 'step': 2563, 'epoch': 2} {'type': 'loss', 'content': 0.26353394985198975, 'timestamp': '2025-09-05 09:08:59.183922', 'step': 2564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:59.337378', 'step': 2564, 'epoch': 2} {'type': 'loss', 'content': 0.278327077627182, 'timestamp': '2025-09-05 09:08:59.339152', 'step': 2565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:08:59.506890', 'step': 2565, 'epoch': 2} {'type': 'loss', 'content': 0.4738348126411438, 'timestamp': '2025-09-05 09:08:59.508929', 'step': 2566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:08:59.666733', 'step': 2566, 'epoch': 2} {'type': 'loss', 'content': 0.22108004987239838, 'timestamp': '2025-09-05 09:08:59.668438', 'step': 2567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:59.824336', 'step': 2567, 'epoch': 2} {'type': 'loss', 'content': 0.3641687333583832, 'timestamp': '2025-09-05 09:08:59.838720', 'step': 2568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:08:59.997932', 'step': 2568, 'epoch': 2} {'type': 'loss', 'content': 0.2895580232143402, 'timestamp': '2025-09-05 09:09:00.000313', 'step': 2569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:00.135261', 'step': 2569, 'epoch': 2} {'type': 'loss', 'content': 0.3097214698791504, 'timestamp': '2025-09-05 09:09:00.137099', 'step': 2570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:00.304405', 'step': 2570, 'epoch': 2} {'type': 'loss', 'content': 0.3036538064479828, 'timestamp': '2025-09-05 09:09:00.306729', 'step': 2571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:00.463817', 'step': 2571, 'epoch': 2} {'type': 'loss', 'content': 0.3194674849510193, 'timestamp': '2025-09-05 09:09:00.477410', 'step': 2572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:00.628501', 'step': 2572, 'epoch': 2} {'type': 'loss', 'content': 0.3619299829006195, 'timestamp': '2025-09-05 09:09:00.630635', 'step': 2573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:00.790157', 'step': 2573, 'epoch': 2} {'type': 'loss', 'content': 0.2202121913433075, 'timestamp': '2025-09-05 09:09:00.792091', 'step': 2574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:00.948683', 'step': 2574, 'epoch': 2} {'type': 'loss', 'content': 0.2178782969713211, 'timestamp': '2025-09-05 09:09:00.950541', 'step': 2575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:01.121052', 'step': 2575, 'epoch': 2} {'type': 'loss', 'content': 0.30588048696517944, 'timestamp': '2025-09-05 09:09:01.135032', 'step': 2576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:09:01.288635', 'step': 2576, 'epoch': 2} {'type': 'loss', 'content': 0.252217561006546, 'timestamp': '2025-09-05 09:09:01.290940', 'step': 2577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:09:01.460305', 'step': 2577, 'epoch': 2} {'type': 'loss', 'content': 0.3836561143398285, 'timestamp': '2025-09-05 09:09:01.462184', 'step': 2578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:01.632772', 'step': 2578, 'epoch': 2} {'type': 'loss', 'content': 0.14467762410640717, 'timestamp': '2025-09-05 09:09:01.635088', 'step': 2579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:01.772020', 'step': 2579, 'epoch': 2} {'type': 'loss', 'content': 0.36755555868148804, 'timestamp': '2025-09-05 09:09:01.788192', 'step': 2580, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:09:06.435373', 'step': 2580, 'epoch': 2} {'type': 'pplx', 'content': 54.81730763687925, 'timestamp': '2025-09-05 09:09:06.438007', 'step': 2580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:06.570269', 'step': 2580, 'epoch': 2} {'type': 'loss', 'content': 0.3310108780860901, 'timestamp': '2025-09-05 09:09:06.572346', 'step': 2581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:06.736690', 'step': 2581, 'epoch': 2} {'type': 'loss', 'content': 0.3351535499095917, 'timestamp': '2025-09-05 09:09:06.738837', 'step': 2582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:06.902060', 'step': 2582, 'epoch': 2} {'type': 'loss', 'content': 0.3105543851852417, 'timestamp': '2025-09-05 09:09:06.904103', 'step': 2583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:07.068138', 'step': 2583, 'epoch': 2} {'type': 'loss', 'content': 0.3798165023326874, 'timestamp': '2025-09-05 09:09:07.081927', 'step': 2584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:07.239587', 'step': 2584, 'epoch': 2} {'type': 'loss', 'content': 0.2240927517414093, 'timestamp': '2025-09-05 09:09:07.241587', 'step': 2585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:09:07.446448', 'step': 2585, 'epoch': 2} {'type': 'loss', 'content': 0.44238588213920593, 'timestamp': '2025-09-05 09:09:07.448922', 'step': 2586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:07.617891', 'step': 2586, 'epoch': 2} {'type': 'loss', 'content': 0.3627106845378876, 'timestamp': '2025-09-05 09:09:07.620036', 'step': 2587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:09:07.788432', 'step': 2587, 'epoch': 2} {'type': 'loss', 'content': 0.41372790932655334, 'timestamp': '2025-09-05 09:09:07.797290', 'step': 2588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:07.933122', 'step': 2588, 'epoch': 2} {'type': 'loss', 'content': 0.1950422078371048, 'timestamp': '2025-09-05 09:09:07.935093', 'step': 2589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:08.070991', 'step': 2589, 'epoch': 2} {'type': 'loss', 'content': 0.23546253144741058, 'timestamp': '2025-09-05 09:09:08.073472', 'step': 2590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:08.251746', 'step': 2590, 'epoch': 2} {'type': 'loss', 'content': 0.24141792953014374, 'timestamp': '2025-09-05 09:09:08.254326', 'step': 2591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:08.425452', 'step': 2591, 'epoch': 2} {'type': 'loss', 'content': 0.3847906291484833, 'timestamp': '2025-09-05 09:09:08.439722', 'step': 2592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:08.634658', 'step': 2592, 'epoch': 2} {'type': 'loss', 'content': 0.34318065643310547, 'timestamp': '2025-09-05 09:09:08.637294', 'step': 2593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:08.822224', 'step': 2593, 'epoch': 2} {'type': 'loss', 'content': 0.3513537645339966, 'timestamp': '2025-09-05 09:09:08.824692', 'step': 2594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:09.010563', 'step': 2594, 'epoch': 2} {'type': 'loss', 'content': 0.2467866986989975, 'timestamp': '2025-09-05 09:09:09.014666', 'step': 2595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:09.204328', 'step': 2595, 'epoch': 2} {'type': 'loss', 'content': 0.31491708755493164, 'timestamp': '2025-09-05 09:09:09.221207', 'step': 2596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:09.387748', 'step': 2596, 'epoch': 2} {'type': 'loss', 'content': 0.42033904790878296, 'timestamp': '2025-09-05 09:09:09.390084', 'step': 2597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:09.556120', 'step': 2597, 'epoch': 2} {'type': 'loss', 'content': 0.25425952672958374, 'timestamp': '2025-09-05 09:09:09.558324', 'step': 2598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:09.721976', 'step': 2598, 'epoch': 2} {'type': 'loss', 'content': 0.2386419177055359, 'timestamp': '2025-09-05 09:09:09.724701', 'step': 2599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:09.888119', 'step': 2599, 'epoch': 2} {'type': 'loss', 'content': 0.1672760248184204, 'timestamp': '2025-09-05 09:09:09.902009', 'step': 2600, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:09:14.531558', 'step': 2600, 'epoch': 2} {'type': 'pplx', 'content': 54.832467462479364, 'timestamp': '2025-09-05 09:09:14.533780', 'step': 2600, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 2600', 'timestamp': '2025-09-05 09:09:15.004154', 'step': 2600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:15.143984', 'step': 2600, 'epoch': 3} {'type': 'loss', 'content': 0.2909306585788727, 'timestamp': '2025-09-05 09:09:15.145976', 'step': 2601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:15.310389', 'step': 2601, 'epoch': 3} {'type': 'loss', 'content': 0.369361937046051, 'timestamp': '2025-09-05 09:09:15.312190', 'step': 2602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:15.475409', 'step': 2602, 'epoch': 3} {'type': 'loss', 'content': 0.2690865099430084, 'timestamp': '2025-09-05 09:09:15.477319', 'step': 2603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:15.649941', 'step': 2603, 'epoch': 3} {'type': 'loss', 'content': 0.41636621952056885, 'timestamp': '2025-09-05 09:09:15.664165', 'step': 2604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:15.829025', 'step': 2604, 'epoch': 3} {'type': 'loss', 'content': 0.24433229863643646, 'timestamp': '2025-09-05 09:09:15.830945', 'step': 2605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:15.996184', 'step': 2605, 'epoch': 3} {'type': 'loss', 'content': 0.3325082063674927, 'timestamp': '2025-09-05 09:09:15.998155', 'step': 2606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:16.161457', 'step': 2606, 'epoch': 3} {'type': 'loss', 'content': 0.2946546971797943, 'timestamp': '2025-09-05 09:09:16.163593', 'step': 2607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:16.327289', 'step': 2607, 'epoch': 3} {'type': 'loss', 'content': 0.46848660707473755, 'timestamp': '2025-09-05 09:09:16.341734', 'step': 2608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:16.498726', 'step': 2608, 'epoch': 3} {'type': 'loss', 'content': 0.39881542325019836, 'timestamp': '2025-09-05 09:09:16.502038', 'step': 2609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:16.669668', 'step': 2609, 'epoch': 3} {'type': 'loss', 'content': 0.3879721462726593, 'timestamp': '2025-09-05 09:09:16.671710', 'step': 2610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:16.844414', 'step': 2610, 'epoch': 3} {'type': 'loss', 'content': 0.3810281455516815, 'timestamp': '2025-09-05 09:09:16.846276', 'step': 2611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:17.020193', 'step': 2611, 'epoch': 3} {'type': 'loss', 'content': 0.41419529914855957, 'timestamp': '2025-09-05 09:09:17.034287', 'step': 2612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:17.192268', 'step': 2612, 'epoch': 3} {'type': 'loss', 'content': 0.296019047498703, 'timestamp': '2025-09-05 09:09:17.195977', 'step': 2613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:17.359277', 'step': 2613, 'epoch': 3} {'type': 'loss', 'content': 0.39310163259506226, 'timestamp': '2025-09-05 09:09:17.361128', 'step': 2614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:17.519311', 'step': 2614, 'epoch': 3} {'type': 'loss', 'content': 0.2674983739852905, 'timestamp': '2025-09-05 09:09:17.521442', 'step': 2615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:17.678767', 'step': 2615, 'epoch': 3} {'type': 'loss', 'content': 0.34225496649742126, 'timestamp': '2025-09-05 09:09:17.692054', 'step': 2616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:17.843075', 'step': 2616, 'epoch': 3} {'type': 'loss', 'content': 0.36751917004585266, 'timestamp': '2025-09-05 09:09:17.845103', 'step': 2617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:18.001589', 'step': 2617, 'epoch': 3} {'type': 'loss', 'content': 0.28645431995391846, 'timestamp': '2025-09-05 09:09:18.003740', 'step': 2618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:18.163572', 'step': 2618, 'epoch': 3} {'type': 'loss', 'content': 0.27839794754981995, 'timestamp': '2025-09-05 09:09:18.165439', 'step': 2619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:18.322425', 'step': 2619, 'epoch': 3} {'type': 'loss', 'content': 0.28111451864242554, 'timestamp': '2025-09-05 09:09:18.336168', 'step': 2620, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:09:22.963303', 'step': 2620, 'epoch': 3} {'type': 'pplx', 'content': 54.74175043223115, 'timestamp': '2025-09-05 09:09:22.965443', 'step': 2620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:23.097523', 'step': 2620, 'epoch': 3} {'type': 'loss', 'content': 0.41243675351142883, 'timestamp': '2025-09-05 09:09:23.099641', 'step': 2621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:23.236457', 'step': 2621, 'epoch': 3} {'type': 'loss', 'content': 0.4278426766395569, 'timestamp': '2025-09-05 09:09:23.238298', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:23.374088', 'step': 2622, 'epoch': 3} {'type': 'loss', 'content': 0.25342079997062683, 'timestamp': '2025-09-05 09:09:23.376267', 'step': 2623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:23.513612', 'step': 2623, 'epoch': 3} {'type': 'loss', 'content': 0.4077974259853363, 'timestamp': '2025-09-05 09:09:23.522957', 'step': 2624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:23.657452', 'step': 2624, 'epoch': 3} {'type': 'loss', 'content': 0.2912677526473999, 'timestamp': '2025-09-05 09:09:23.659379', 'step': 2625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:23.794514', 'step': 2625, 'epoch': 3} {'type': 'loss', 'content': 0.28832995891571045, 'timestamp': '2025-09-05 09:09:23.796204', 'step': 2626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:23.931743', 'step': 2626, 'epoch': 3} {'type': 'loss', 'content': 0.3055991530418396, 'timestamp': '2025-09-05 09:09:23.934138', 'step': 2627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:24.103487', 'step': 2627, 'epoch': 3} {'type': 'loss', 'content': 0.3616314232349396, 'timestamp': '2025-09-05 09:09:24.118124', 'step': 2628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:24.272435', 'step': 2628, 'epoch': 3} {'type': 'loss', 'content': 0.29930540919303894, 'timestamp': '2025-09-05 09:09:24.274499', 'step': 2629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:24.432410', 'step': 2629, 'epoch': 3} {'type': 'loss', 'content': 0.27770620584487915, 'timestamp': '2025-09-05 09:09:24.434428', 'step': 2630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:24.603178', 'step': 2630, 'epoch': 3} {'type': 'loss', 'content': 0.4240753948688507, 'timestamp': '2025-09-05 09:09:24.604981', 'step': 2631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:24.739831', 'step': 2631, 'epoch': 3} {'type': 'loss', 'content': 0.2943328619003296, 'timestamp': '2025-09-05 09:09:24.755715', 'step': 2632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:24.915341', 'step': 2632, 'epoch': 3} {'type': 'loss', 'content': 0.28002142906188965, 'timestamp': '2025-09-05 09:09:24.917749', 'step': 2633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:25.074494', 'step': 2633, 'epoch': 3} {'type': 'loss', 'content': 0.3681841194629669, 'timestamp': '2025-09-05 09:09:25.076991', 'step': 2634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:25.242111', 'step': 2634, 'epoch': 3} {'type': 'loss', 'content': 0.3582687973976135, 'timestamp': '2025-09-05 09:09:25.244392', 'step': 2635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:25.403267', 'step': 2635, 'epoch': 3} {'type': 'loss', 'content': 0.38165122270584106, 'timestamp': '2025-09-05 09:09:25.417274', 'step': 2636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:25.569052', 'step': 2636, 'epoch': 3} {'type': 'loss', 'content': 0.3990744352340698, 'timestamp': '2025-09-05 09:09:25.571190', 'step': 2637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:25.733478', 'step': 2637, 'epoch': 3} {'type': 'loss', 'content': 0.33937230706214905, 'timestamp': '2025-09-05 09:09:25.735736', 'step': 2638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:25.911457', 'step': 2638, 'epoch': 3} {'type': 'loss', 'content': 0.3162641227245331, 'timestamp': '2025-09-05 09:09:25.913737', 'step': 2639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:26.072299', 'step': 2639, 'epoch': 3} {'type': 'loss', 'content': 0.2395915687084198, 'timestamp': '2025-09-05 09:09:26.088449', 'step': 2640, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:09:30.715372', 'step': 2640, 'epoch': 3} {'type': 'pplx', 'content': 54.28784188000765, 'timestamp': '2025-09-05 09:09:30.717835', 'step': 2640, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2640', 'timestamp': '2025-09-05 09:09:31.194816', 'step': 2640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:31.341771', 'step': 2640, 'epoch': 3} {'type': 'loss', 'content': 0.3118651211261749, 'timestamp': '2025-09-05 09:09:31.344881', 'step': 2641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:31.503551', 'step': 2641, 'epoch': 3} {'type': 'loss', 'content': 0.33012741804122925, 'timestamp': '2025-09-05 09:09:31.505717', 'step': 2642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:31.662794', 'step': 2642, 'epoch': 3} {'type': 'loss', 'content': 0.376676470041275, 'timestamp': '2025-09-05 09:09:31.665311', 'step': 2643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:31.833512', 'step': 2643, 'epoch': 3} {'type': 'loss', 'content': 0.30343472957611084, 'timestamp': '2025-09-05 09:09:31.847642', 'step': 2644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:32.002431', 'step': 2644, 'epoch': 3} {'type': 'loss', 'content': 0.3581269681453705, 'timestamp': '2025-09-05 09:09:32.004501', 'step': 2645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:32.173097', 'step': 2645, 'epoch': 3} {'type': 'loss', 'content': 0.22369900345802307, 'timestamp': '2025-09-05 09:09:32.174951', 'step': 2646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:32.309856', 'step': 2646, 'epoch': 3} {'type': 'loss', 'content': 0.3513410985469818, 'timestamp': '2025-09-05 09:09:32.312072', 'step': 2647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:32.481357', 'step': 2647, 'epoch': 3} {'type': 'loss', 'content': 0.44361498951911926, 'timestamp': '2025-09-05 09:09:32.495464', 'step': 2648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:32.649538', 'step': 2648, 'epoch': 3} {'type': 'loss', 'content': 0.2907888889312744, 'timestamp': '2025-09-05 09:09:32.651742', 'step': 2649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:32.808844', 'step': 2649, 'epoch': 3} {'type': 'loss', 'content': 0.37796589732170105, 'timestamp': '2025-09-05 09:09:32.810776', 'step': 2650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:32.968897', 'step': 2650, 'epoch': 3} {'type': 'loss', 'content': 0.29424649477005005, 'timestamp': '2025-09-05 09:09:32.970994', 'step': 2651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:33.129396', 'step': 2651, 'epoch': 3} {'type': 'loss', 'content': 0.36393117904663086, 'timestamp': '2025-09-05 09:09:33.145047', 'step': 2652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:33.310349', 'step': 2652, 'epoch': 3} {'type': 'loss', 'content': 0.3564506769180298, 'timestamp': '2025-09-05 09:09:33.312321', 'step': 2653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:33.478108', 'step': 2653, 'epoch': 3} {'type': 'loss', 'content': 0.20433595776557922, 'timestamp': '2025-09-05 09:09:33.479956', 'step': 2654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:33.638178', 'step': 2654, 'epoch': 3} {'type': 'loss', 'content': 0.38941457867622375, 'timestamp': '2025-09-05 09:09:33.641092', 'step': 2655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:33.798976', 'step': 2655, 'epoch': 3} {'type': 'loss', 'content': 0.276498019695282, 'timestamp': '2025-09-05 09:09:33.813114', 'step': 2656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:33.964685', 'step': 2656, 'epoch': 3} {'type': 'loss', 'content': 0.22760221362113953, 'timestamp': '2025-09-05 09:09:33.966648', 'step': 2657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:34.133765', 'step': 2657, 'epoch': 3} {'type': 'loss', 'content': 0.39622488617897034, 'timestamp': '2025-09-05 09:09:34.135682', 'step': 2658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:34.293759', 'step': 2658, 'epoch': 3} {'type': 'loss', 'content': 0.3628631830215454, 'timestamp': '2025-09-05 09:09:34.295822', 'step': 2659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:34.453730', 'step': 2659, 'epoch': 3} {'type': 'loss', 'content': 0.28753095865249634, 'timestamp': '2025-09-05 09:09:34.467476', 'step': 2660, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:09:39.114526', 'step': 2660, 'epoch': 3} {'type': 'pplx', 'content': 54.31525715361568, 'timestamp': '2025-09-05 09:09:39.116914', 'step': 2660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:39.253139', 'step': 2660, 'epoch': 3} {'type': 'loss', 'content': 0.25859639048576355, 'timestamp': '2025-09-05 09:09:39.264102', 'step': 2661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:39.404262', 'step': 2661, 'epoch': 3} {'type': 'loss', 'content': 0.23019303381443024, 'timestamp': '2025-09-05 09:09:39.407220', 'step': 2662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:39.581353', 'step': 2662, 'epoch': 3} {'type': 'loss', 'content': 0.5155594348907471, 'timestamp': '2025-09-05 09:09:39.583174', 'step': 2663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:39.741092', 'step': 2663, 'epoch': 3} {'type': 'loss', 'content': 0.37276023626327515, 'timestamp': '2025-09-05 09:09:39.755490', 'step': 2664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:39.907403', 'step': 2664, 'epoch': 3} {'type': 'loss', 'content': 0.41703230142593384, 'timestamp': '2025-09-05 09:09:39.909321', 'step': 2665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:40.066875', 'step': 2665, 'epoch': 3} {'type': 'loss', 'content': 0.2926308512687683, 'timestamp': '2025-09-05 09:09:40.068643', 'step': 2666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:40.226508', 'step': 2666, 'epoch': 3} {'type': 'loss', 'content': 0.4091567099094391, 'timestamp': '2025-09-05 09:09:40.228193', 'step': 2667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:40.397210', 'step': 2667, 'epoch': 3} {'type': 'loss', 'content': 0.26959991455078125, 'timestamp': '2025-09-05 09:09:40.414775', 'step': 2668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:40.572190', 'step': 2668, 'epoch': 3} {'type': 'loss', 'content': 0.2950553894042969, 'timestamp': '2025-09-05 09:09:40.587593', 'step': 2669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:40.769007', 'step': 2669, 'epoch': 3} {'type': 'loss', 'content': 0.35082414746284485, 'timestamp': '2025-09-05 09:09:40.771348', 'step': 2670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:40.933475', 'step': 2670, 'epoch': 3} {'type': 'loss', 'content': 0.2740470767021179, 'timestamp': '2025-09-05 09:09:40.935596', 'step': 2671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:41.106478', 'step': 2671, 'epoch': 3} {'type': 'loss', 'content': 0.3159100115299225, 'timestamp': '2025-09-05 09:09:41.115332', 'step': 2672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:41.249508', 'step': 2672, 'epoch': 3} {'type': 'loss', 'content': 0.4648512303829193, 'timestamp': '2025-09-05 09:09:41.251877', 'step': 2673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:09:41.409328', 'step': 2673, 'epoch': 3} {'type': 'loss', 'content': 0.3527366518974304, 'timestamp': '2025-09-05 09:09:41.411381', 'step': 2674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:41.584798', 'step': 2674, 'epoch': 3} {'type': 'loss', 'content': 0.26771530508995056, 'timestamp': '2025-09-05 09:09:41.586705', 'step': 2675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:41.745088', 'step': 2675, 'epoch': 3} {'type': 'loss', 'content': 0.34135565161705017, 'timestamp': '2025-09-05 09:09:41.761326', 'step': 2676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:41.914539', 'step': 2676, 'epoch': 3} {'type': 'loss', 'content': 0.35681870579719543, 'timestamp': '2025-09-05 09:09:41.920362', 'step': 2677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:42.083593', 'step': 2677, 'epoch': 3} {'type': 'loss', 'content': 0.22552332282066345, 'timestamp': '2025-09-05 09:09:42.085921', 'step': 2678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:42.255612', 'step': 2678, 'epoch': 3} {'type': 'loss', 'content': 0.24503037333488464, 'timestamp': '2025-09-05 09:09:42.257451', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:42.394045', 'step': 2679, 'epoch': 3} {'type': 'loss', 'content': 0.228922039270401, 'timestamp': '2025-09-05 09:09:42.410435', 'step': 2680, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:09:47.123710', 'step': 2680, 'epoch': 3} {'type': 'pplx', 'content': 54.31896988097937, 'timestamp': '2025-09-05 09:09:47.126173', 'step': 2680, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2680', 'timestamp': '2025-09-05 09:09:47.641795', 'step': 2680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:47.775868', 'step': 2680, 'epoch': 3} {'type': 'loss', 'content': 0.3755874037742615, 'timestamp': '2025-09-05 09:09:47.778167', 'step': 2681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:47.915328', 'step': 2681, 'epoch': 3} {'type': 'loss', 'content': 0.30692851543426514, 'timestamp': '2025-09-05 09:09:47.917502', 'step': 2682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:48.087264', 'step': 2682, 'epoch': 3} {'type': 'loss', 'content': 0.41206371784210205, 'timestamp': '2025-09-05 09:09:48.089321', 'step': 2683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:48.249641', 'step': 2683, 'epoch': 3} {'type': 'loss', 'content': 0.38735464215278625, 'timestamp': '2025-09-05 09:09:48.263554', 'step': 2684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:48.415430', 'step': 2684, 'epoch': 3} {'type': 'loss', 'content': 0.27017661929130554, 'timestamp': '2025-09-05 09:09:48.417668', 'step': 2685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:48.553584', 'step': 2685, 'epoch': 3} {'type': 'loss', 'content': 0.3175680637359619, 'timestamp': '2025-09-05 09:09:48.555889', 'step': 2686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:48.723796', 'step': 2686, 'epoch': 3} {'type': 'loss', 'content': 0.3680326044559479, 'timestamp': '2025-09-05 09:09:48.725538', 'step': 2687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:09:48.885954', 'step': 2687, 'epoch': 3} {'type': 'loss', 'content': 0.29450035095214844, 'timestamp': '2025-09-05 09:09:48.900149', 'step': 2688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:49.056183', 'step': 2688, 'epoch': 3} {'type': 'loss', 'content': 0.4046458303928375, 'timestamp': '2025-09-05 09:09:49.058643', 'step': 2689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:49.216287', 'step': 2689, 'epoch': 3} {'type': 'loss', 'content': 0.3243006765842438, 'timestamp': '2025-09-05 09:09:49.218640', 'step': 2690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:49.376779', 'step': 2690, 'epoch': 3} {'type': 'loss', 'content': 0.2823656499385834, 'timestamp': '2025-09-05 09:09:49.379245', 'step': 2691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:49.557827', 'step': 2691, 'epoch': 3} {'type': 'loss', 'content': 0.3415341079235077, 'timestamp': '2025-09-05 09:09:49.574030', 'step': 2692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:49.735573', 'step': 2692, 'epoch': 3} {'type': 'loss', 'content': 0.20649684965610504, 'timestamp': '2025-09-05 09:09:49.737926', 'step': 2693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:49.897370', 'step': 2693, 'epoch': 3} {'type': 'loss', 'content': 0.41073697805404663, 'timestamp': '2025-09-05 09:09:49.902347', 'step': 2694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:09:50.080020', 'step': 2694, 'epoch': 3} {'type': 'loss', 'content': 0.4066583514213562, 'timestamp': '2025-09-05 09:09:50.083816', 'step': 2695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:50.257701', 'step': 2695, 'epoch': 3} {'type': 'loss', 'content': 0.43231305480003357, 'timestamp': '2025-09-05 09:09:50.271674', 'step': 2696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:50.425120', 'step': 2696, 'epoch': 3} {'type': 'loss', 'content': 0.43370968103408813, 'timestamp': '2025-09-05 09:09:50.427885', 'step': 2697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:50.587228', 'step': 2697, 'epoch': 3} {'type': 'loss', 'content': 0.3017624616622925, 'timestamp': '2025-09-05 09:09:50.589376', 'step': 2698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:50.747465', 'step': 2698, 'epoch': 3} {'type': 'loss', 'content': 0.49601367115974426, 'timestamp': '2025-09-05 09:09:50.749698', 'step': 2699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:50.919572', 'step': 2699, 'epoch': 3} {'type': 'loss', 'content': 0.3051532208919525, 'timestamp': '2025-09-05 09:09:50.934651', 'step': 2700, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:09:55.581191', 'step': 2700, 'epoch': 3} {'type': 'pplx', 'content': 54.71914685947849, 'timestamp': '2025-09-05 09:09:55.583324', 'step': 2700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:55.715082', 'step': 2700, 'epoch': 3} {'type': 'loss', 'content': 0.24363669753074646, 'timestamp': '2025-09-05 09:09:55.717265', 'step': 2701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:09:55.873487', 'step': 2701, 'epoch': 3} {'type': 'loss', 'content': 0.3039955198764801, 'timestamp': '2025-09-05 09:09:55.875289', 'step': 2702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:56.042793', 'step': 2702, 'epoch': 3} {'type': 'loss', 'content': 0.3395662307739258, 'timestamp': '2025-09-05 09:09:56.044668', 'step': 2703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:56.180369', 'step': 2703, 'epoch': 3} {'type': 'loss', 'content': 0.23626942932605743, 'timestamp': '2025-09-05 09:09:56.196460', 'step': 2704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:56.356335', 'step': 2704, 'epoch': 3} {'type': 'loss', 'content': 0.3257940411567688, 'timestamp': '2025-09-05 09:09:56.358374', 'step': 2705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:56.516397', 'step': 2705, 'epoch': 3} {'type': 'loss', 'content': 0.3324272930622101, 'timestamp': '2025-09-05 09:09:56.519005', 'step': 2706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:56.658125', 'step': 2706, 'epoch': 3} {'type': 'loss', 'content': 0.18577173352241516, 'timestamp': '2025-09-05 09:09:56.661453', 'step': 2707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:56.843395', 'step': 2707, 'epoch': 3} {'type': 'loss', 'content': 0.3146699070930481, 'timestamp': '2025-09-05 09:09:56.860248', 'step': 2708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:09:57.033006', 'step': 2708, 'epoch': 3} {'type': 'loss', 'content': 0.23967741429805756, 'timestamp': '2025-09-05 09:09:57.036268', 'step': 2709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:57.212345', 'step': 2709, 'epoch': 3} {'type': 'loss', 'content': 0.23511698842048645, 'timestamp': '2025-09-05 09:09:57.214817', 'step': 2710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:57.392545', 'step': 2710, 'epoch': 3} {'type': 'loss', 'content': 0.28559327125549316, 'timestamp': '2025-09-05 09:09:57.394953', 'step': 2711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:57.569750', 'step': 2711, 'epoch': 3} {'type': 'loss', 'content': 0.5006580352783203, 'timestamp': '2025-09-05 09:09:57.584025', 'step': 2712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:57.742993', 'step': 2712, 'epoch': 3} {'type': 'loss', 'content': 0.2922325134277344, 'timestamp': '2025-09-05 09:09:57.751655', 'step': 2713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:09:57.922784', 'step': 2713, 'epoch': 3} {'type': 'loss', 'content': 0.17340846359729767, 'timestamp': '2025-09-05 09:09:57.930195', 'step': 2714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:09:58.110260', 'step': 2714, 'epoch': 3} {'type': 'loss', 'content': 0.22841380536556244, 'timestamp': '2025-09-05 09:09:58.113975', 'step': 2715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:58.276525', 'step': 2715, 'epoch': 3} {'type': 'loss', 'content': 0.3006599247455597, 'timestamp': '2025-09-05 09:09:58.293741', 'step': 2716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:09:58.459743', 'step': 2716, 'epoch': 3} {'type': 'loss', 'content': 0.3553715646266937, 'timestamp': '2025-09-05 09:09:58.464739', 'step': 2717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:58.636391', 'step': 2717, 'epoch': 3} {'type': 'loss', 'content': 0.31938436627388, 'timestamp': '2025-09-05 09:09:58.639450', 'step': 2718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:09:58.819436', 'step': 2718, 'epoch': 3} {'type': 'loss', 'content': 0.22373679280281067, 'timestamp': '2025-09-05 09:09:58.822596', 'step': 2719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:09:59.011310', 'step': 2719, 'epoch': 3} {'type': 'loss', 'content': 0.20499032735824585, 'timestamp': '2025-09-05 09:09:59.033254', 'step': 2720, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:10:03.869184', 'step': 2720, 'epoch': 3} {'type': 'pplx', 'content': 55.399467890917855, 'timestamp': '2025-09-05 09:10:03.871173', 'step': 2720, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2720', 'timestamp': '2025-09-05 09:10:04.314045', 'step': 2720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:04.453237', 'step': 2720, 'epoch': 3} {'type': 'loss', 'content': 0.3693501055240631, 'timestamp': '2025-09-05 09:10:04.455637', 'step': 2721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:04.623810', 'step': 2721, 'epoch': 3} {'type': 'loss', 'content': 0.2824878990650177, 'timestamp': '2025-09-05 09:10:04.627198', 'step': 2722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:04.787875', 'step': 2722, 'epoch': 3} {'type': 'loss', 'content': 0.307536244392395, 'timestamp': '2025-09-05 09:10:04.790194', 'step': 2723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:04.950797', 'step': 2723, 'epoch': 3} {'type': 'loss', 'content': 0.2077270895242691, 'timestamp': '2025-09-05 09:10:04.959505', 'step': 2724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:05.097999', 'step': 2724, 'epoch': 3} {'type': 'loss', 'content': 0.2842904031276703, 'timestamp': '2025-09-05 09:10:05.102783', 'step': 2725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:05.262814', 'step': 2725, 'epoch': 3} {'type': 'loss', 'content': 0.30767548084259033, 'timestamp': '2025-09-05 09:10:05.266837', 'step': 2726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:05.426245', 'step': 2726, 'epoch': 3} {'type': 'loss', 'content': 0.41483354568481445, 'timestamp': '2025-09-05 09:10:05.429925', 'step': 2727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:05.592194', 'step': 2727, 'epoch': 3} {'type': 'loss', 'content': 0.2677007019519806, 'timestamp': '2025-09-05 09:10:05.606152', 'step': 2728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:05.761691', 'step': 2728, 'epoch': 3} {'type': 'loss', 'content': 0.3418111801147461, 'timestamp': '2025-09-05 09:10:05.763999', 'step': 2729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:05.901368', 'step': 2729, 'epoch': 3} {'type': 'loss', 'content': 0.26328641176223755, 'timestamp': '2025-09-05 09:10:05.905885', 'step': 2730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:06.077778', 'step': 2730, 'epoch': 3} {'type': 'loss', 'content': 0.287534236907959, 'timestamp': '2025-09-05 09:10:06.079870', 'step': 2731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:06.218060', 'step': 2731, 'epoch': 3} {'type': 'loss', 'content': 0.23290354013442993, 'timestamp': '2025-09-05 09:10:06.232682', 'step': 2732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:10:06.387980', 'step': 2732, 'epoch': 3} {'type': 'loss', 'content': 0.44484513998031616, 'timestamp': '2025-09-05 09:10:06.390645', 'step': 2733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:06.560661', 'step': 2733, 'epoch': 3} {'type': 'loss', 'content': 0.3004152774810791, 'timestamp': '2025-09-05 09:10:06.565345', 'step': 2734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:06.728713', 'step': 2734, 'epoch': 3} {'type': 'loss', 'content': 0.23439401388168335, 'timestamp': '2025-09-05 09:10:06.731721', 'step': 2735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:06.906635', 'step': 2735, 'epoch': 3} {'type': 'loss', 'content': 0.39753445982933044, 'timestamp': '2025-09-05 09:10:06.921311', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:07.081558', 'step': 2736, 'epoch': 3} {'type': 'loss', 'content': 0.26311197876930237, 'timestamp': '2025-09-05 09:10:07.084292', 'step': 2737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:07.246437', 'step': 2737, 'epoch': 3} {'type': 'loss', 'content': 0.2834315896034241, 'timestamp': '2025-09-05 09:10:07.249366', 'step': 2738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:07.421327', 'step': 2738, 'epoch': 3} {'type': 'loss', 'content': 0.35053861141204834, 'timestamp': '2025-09-05 09:10:07.424016', 'step': 2739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:07.601132', 'step': 2739, 'epoch': 3} {'type': 'loss', 'content': 0.2513752579689026, 'timestamp': '2025-09-05 09:10:07.618283', 'step': 2740, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:10:12.409186', 'step': 2740, 'epoch': 3} {'type': 'pplx', 'content': 55.606292247536096, 'timestamp': '2025-09-05 09:10:12.416713', 'step': 2740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:12.551578', 'step': 2740, 'epoch': 3} {'type': 'loss', 'content': 0.2897755801677704, 'timestamp': '2025-09-05 09:10:12.558344', 'step': 2741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:12.698223', 'step': 2741, 'epoch': 3} {'type': 'loss', 'content': 0.3819401264190674, 'timestamp': '2025-09-05 09:10:12.701247', 'step': 2742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:12.841144', 'step': 2742, 'epoch': 3} {'type': 'loss', 'content': 0.3809162378311157, 'timestamp': '2025-09-05 09:10:12.847553', 'step': 2743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:10:12.986801', 'step': 2743, 'epoch': 3} {'type': 'loss', 'content': 0.27368873357772827, 'timestamp': '2025-09-05 09:10:12.996377', 'step': 2744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:13.135351', 'step': 2744, 'epoch': 3} {'type': 'loss', 'content': 0.33143290877342224, 'timestamp': '2025-09-05 09:10:13.144607', 'step': 2745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:13.288483', 'step': 2745, 'epoch': 3} {'type': 'loss', 'content': 0.23653104901313782, 'timestamp': '2025-09-05 09:10:13.293088', 'step': 2746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:13.468711', 'step': 2746, 'epoch': 3} {'type': 'loss', 'content': 0.37218546867370605, 'timestamp': '2025-09-05 09:10:13.471453', 'step': 2747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:13.645855', 'step': 2747, 'epoch': 3} {'type': 'loss', 'content': 0.27283674478530884, 'timestamp': '2025-09-05 09:10:13.662936', 'step': 2748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:13.822337', 'step': 2748, 'epoch': 3} {'type': 'loss', 'content': 0.4320858120918274, 'timestamp': '2025-09-05 09:10:13.825185', 'step': 2749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:14.000505', 'step': 2749, 'epoch': 3} {'type': 'loss', 'content': 0.2452457696199417, 'timestamp': '2025-09-05 09:10:14.014447', 'step': 2750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:14.182660', 'step': 2750, 'epoch': 3} {'type': 'loss', 'content': 0.1904885619878769, 'timestamp': '2025-09-05 09:10:14.185589', 'step': 2751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:14.350431', 'step': 2751, 'epoch': 3} {'type': 'loss', 'content': 0.3453499376773834, 'timestamp': '2025-09-05 09:10:14.365541', 'step': 2752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:14.524726', 'step': 2752, 'epoch': 3} {'type': 'loss', 'content': 0.267494261264801, 'timestamp': '2025-09-05 09:10:14.528420', 'step': 2753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:14.716589', 'step': 2753, 'epoch': 3} {'type': 'loss', 'content': 0.43331682682037354, 'timestamp': '2025-09-05 09:10:14.720911', 'step': 2754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:14.901813', 'step': 2754, 'epoch': 3} {'type': 'loss', 'content': 0.3932355046272278, 'timestamp': '2025-09-05 09:10:14.905171', 'step': 2755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:15.085460', 'step': 2755, 'epoch': 3} {'type': 'loss', 'content': 0.3386424779891968, 'timestamp': '2025-09-05 09:10:15.102279', 'step': 2756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:15.279114', 'step': 2756, 'epoch': 3} {'type': 'loss', 'content': 0.3107830584049225, 'timestamp': '2025-09-05 09:10:15.282876', 'step': 2757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:15.461908', 'step': 2757, 'epoch': 3} {'type': 'loss', 'content': 0.2651340961456299, 'timestamp': '2025-09-05 09:10:15.465681', 'step': 2758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:15.630422', 'step': 2758, 'epoch': 3} {'type': 'loss', 'content': 0.4322452247142792, 'timestamp': '2025-09-05 09:10:15.639175', 'step': 2759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:15.802596', 'step': 2759, 'epoch': 3} {'type': 'loss', 'content': 0.3348475694656372, 'timestamp': '2025-09-05 09:10:15.819849', 'step': 2760, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:10:20.767860', 'step': 2760, 'epoch': 3} {'type': 'pplx', 'content': 55.56754759928199, 'timestamp': '2025-09-05 09:10:20.772035', 'step': 2760, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2760', 'timestamp': '2025-09-05 09:10:21.365515', 'step': 2760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:21.595230', 'step': 2760, 'epoch': 3} {'type': 'loss', 'content': 0.23760850727558136, 'timestamp': '2025-09-05 09:10:21.602522', 'step': 2761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:21.765693', 'step': 2761, 'epoch': 3} {'type': 'loss', 'content': 0.3854387402534485, 'timestamp': '2025-09-05 09:10:21.776647', 'step': 2762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:10:21.940738', 'step': 2762, 'epoch': 3} {'type': 'loss', 'content': 0.3008959889411926, 'timestamp': '2025-09-05 09:10:21.944579', 'step': 2763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:22.108064', 'step': 2763, 'epoch': 3} {'type': 'loss', 'content': 0.2746279537677765, 'timestamp': '2025-09-05 09:10:22.124956', 'step': 2764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:22.294008', 'step': 2764, 'epoch': 3} {'type': 'loss', 'content': 0.28841450810432434, 'timestamp': '2025-09-05 09:10:22.296980', 'step': 2765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:22.472863', 'step': 2765, 'epoch': 3} {'type': 'loss', 'content': 0.20862191915512085, 'timestamp': '2025-09-05 09:10:22.475833', 'step': 2766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:22.639204', 'step': 2766, 'epoch': 3} {'type': 'loss', 'content': 0.2936626076698303, 'timestamp': '2025-09-05 09:10:22.648560', 'step': 2767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:22.821191', 'step': 2767, 'epoch': 3} {'type': 'loss', 'content': 0.3055102825164795, 'timestamp': '2025-09-05 09:10:22.846361', 'step': 2768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:23.025985', 'step': 2768, 'epoch': 3} {'type': 'loss', 'content': 0.42737877368927, 'timestamp': '2025-09-05 09:10:23.037055', 'step': 2769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:23.210901', 'step': 2769, 'epoch': 3} {'type': 'loss', 'content': 0.28168973326683044, 'timestamp': '2025-09-05 09:10:23.214976', 'step': 2770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:23.404027', 'step': 2770, 'epoch': 3} {'type': 'loss', 'content': 0.4235506057739258, 'timestamp': '2025-09-05 09:10:23.407613', 'step': 2771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:23.588220', 'step': 2771, 'epoch': 3} {'type': 'loss', 'content': 0.28592631220817566, 'timestamp': '2025-09-05 09:10:23.605025', 'step': 2772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:23.773596', 'step': 2772, 'epoch': 3} {'type': 'loss', 'content': 0.2634555995464325, 'timestamp': '2025-09-05 09:10:23.776889', 'step': 2773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:23.945473', 'step': 2773, 'epoch': 3} {'type': 'loss', 'content': 0.21984124183654785, 'timestamp': '2025-09-05 09:10:23.947745', 'step': 2774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:24.086416', 'step': 2774, 'epoch': 3} {'type': 'loss', 'content': 0.20653694868087769, 'timestamp': '2025-09-05 09:10:24.090554', 'step': 2775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:24.259486', 'step': 2775, 'epoch': 3} {'type': 'loss', 'content': 0.2279079109430313, 'timestamp': '2025-09-05 09:10:24.275952', 'step': 2776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:24.442371', 'step': 2776, 'epoch': 3} {'type': 'loss', 'content': 0.38293153047561646, 'timestamp': '2025-09-05 09:10:24.447664', 'step': 2777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:24.631220', 'step': 2777, 'epoch': 3} {'type': 'loss', 'content': 0.2632295787334442, 'timestamp': '2025-09-05 09:10:24.634597', 'step': 2778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:24.809574', 'step': 2778, 'epoch': 3} {'type': 'loss', 'content': 0.3497149348258972, 'timestamp': '2025-09-05 09:10:24.812800', 'step': 2779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:24.977713', 'step': 2779, 'epoch': 3} {'type': 'loss', 'content': 0.3435024619102478, 'timestamp': '2025-09-05 09:10:24.995774', 'step': 2780, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:10:29.786411', 'step': 2780, 'epoch': 3} {'type': 'pplx', 'content': 55.502120442422544, 'timestamp': '2025-09-05 09:10:29.790346', 'step': 2780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:10:29.922857', 'step': 2780, 'epoch': 3} {'type': 'loss', 'content': 0.2520749866962433, 'timestamp': '2025-09-05 09:10:29.932096', 'step': 2781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:30.071586', 'step': 2781, 'epoch': 3} {'type': 'loss', 'content': 0.2531195282936096, 'timestamp': '2025-09-05 09:10:30.074133', 'step': 2782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:30.213336', 'step': 2782, 'epoch': 3} {'type': 'loss', 'content': 0.41319963335990906, 'timestamp': '2025-09-05 09:10:30.218550', 'step': 2783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:30.360495', 'step': 2783, 'epoch': 3} {'type': 'loss', 'content': 0.25934308767318726, 'timestamp': '2025-09-05 09:10:30.376438', 'step': 2784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:30.514182', 'step': 2784, 'epoch': 3} {'type': 'loss', 'content': 0.16343851387500763, 'timestamp': '2025-09-05 09:10:30.517315', 'step': 2785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:30.657243', 'step': 2785, 'epoch': 3} {'type': 'loss', 'content': 0.34972044825553894, 'timestamp': '2025-09-05 09:10:30.660463', 'step': 2786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:30.800154', 'step': 2786, 'epoch': 3} {'type': 'loss', 'content': 0.3237220346927643, 'timestamp': '2025-09-05 09:10:30.806393', 'step': 2787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:30.948423', 'step': 2787, 'epoch': 3} {'type': 'loss', 'content': 0.2645550072193146, 'timestamp': '2025-09-05 09:10:30.959681', 'step': 2788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:31.095759', 'step': 2788, 'epoch': 3} {'type': 'loss', 'content': 0.3123627305030823, 'timestamp': '2025-09-05 09:10:31.099566', 'step': 2789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:31.249554', 'step': 2789, 'epoch': 3} {'type': 'loss', 'content': 0.2884657680988312, 'timestamp': '2025-09-05 09:10:31.252646', 'step': 2790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:31.431942', 'step': 2790, 'epoch': 3} {'type': 'loss', 'content': 0.22808344662189484, 'timestamp': '2025-09-05 09:10:31.434481', 'step': 2791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:31.608196', 'step': 2791, 'epoch': 3} {'type': 'loss', 'content': 0.4153204560279846, 'timestamp': '2025-09-05 09:10:31.627058', 'step': 2792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:31.788114', 'step': 2792, 'epoch': 3} {'type': 'loss', 'content': 0.24033991992473602, 'timestamp': '2025-09-05 09:10:31.790509', 'step': 2793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:31.966073', 'step': 2793, 'epoch': 3} {'type': 'loss', 'content': 0.2547632157802582, 'timestamp': '2025-09-05 09:10:31.970354', 'step': 2794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:32.145056', 'step': 2794, 'epoch': 3} {'type': 'loss', 'content': 0.3763968050479889, 'timestamp': '2025-09-05 09:10:32.147688', 'step': 2795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:32.310468', 'step': 2795, 'epoch': 3} {'type': 'loss', 'content': 0.4241531491279602, 'timestamp': '2025-09-05 09:10:32.324920', 'step': 2796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:32.478476', 'step': 2796, 'epoch': 3} {'type': 'loss', 'content': 0.2867893874645233, 'timestamp': '2025-09-05 09:10:32.481606', 'step': 2797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:32.645290', 'step': 2797, 'epoch': 3} {'type': 'loss', 'content': 0.2623746991157532, 'timestamp': '2025-09-05 09:10:32.650158', 'step': 2798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:32.829402', 'step': 2798, 'epoch': 3} {'type': 'loss', 'content': 0.353633314371109, 'timestamp': '2025-09-05 09:10:32.832494', 'step': 2799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:33.007499', 'step': 2799, 'epoch': 3} {'type': 'loss', 'content': 0.3284893035888672, 'timestamp': '2025-09-05 09:10:33.023668', 'step': 2800, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:10:37.838708', 'step': 2800, 'epoch': 3} {'type': 'pplx', 'content': 56.313607306551525, 'timestamp': '2025-09-05 09:10:37.840865', 'step': 2800, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2800', 'timestamp': '2025-09-05 09:10:38.386248', 'step': 2800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:38.566979', 'step': 2800, 'epoch': 3} {'type': 'loss', 'content': 0.2466062307357788, 'timestamp': '2025-09-05 09:10:38.574310', 'step': 2801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:38.797138', 'step': 2801, 'epoch': 3} {'type': 'loss', 'content': 0.297829270362854, 'timestamp': '2025-09-05 09:10:38.801752', 'step': 2802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:39.027938', 'step': 2802, 'epoch': 3} {'type': 'loss', 'content': 0.3962607681751251, 'timestamp': '2025-09-05 09:10:39.037773', 'step': 2803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:39.256098', 'step': 2803, 'epoch': 3} {'type': 'loss', 'content': 0.23827433586120605, 'timestamp': '2025-09-05 09:10:39.273813', 'step': 2804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:39.486250', 'step': 2804, 'epoch': 3} {'type': 'loss', 'content': 0.34505435824394226, 'timestamp': '2025-09-05 09:10:39.498012', 'step': 2805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:39.701452', 'step': 2805, 'epoch': 3} {'type': 'loss', 'content': 0.17793166637420654, 'timestamp': '2025-09-05 09:10:39.706284', 'step': 2806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:39.936500', 'step': 2806, 'epoch': 3} {'type': 'loss', 'content': 0.387498140335083, 'timestamp': '2025-09-05 09:10:39.943246', 'step': 2807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:40.141959', 'step': 2807, 'epoch': 3} {'type': 'loss', 'content': 0.25481945276260376, 'timestamp': '2025-09-05 09:10:40.161190', 'step': 2808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:40.369536', 'step': 2808, 'epoch': 3} {'type': 'loss', 'content': 0.22373159229755402, 'timestamp': '2025-09-05 09:10:40.373589', 'step': 2809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:40.584720', 'step': 2809, 'epoch': 3} {'type': 'loss', 'content': 0.2652718126773834, 'timestamp': '2025-09-05 09:10:40.592769', 'step': 2810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:40.811016', 'step': 2810, 'epoch': 3} {'type': 'loss', 'content': 0.24120259284973145, 'timestamp': '2025-09-05 09:10:40.821878', 'step': 2811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:40.994024', 'step': 2811, 'epoch': 3} {'type': 'loss', 'content': 0.29725131392478943, 'timestamp': '2025-09-05 09:10:41.010223', 'step': 2812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:41.187657', 'step': 2812, 'epoch': 3} {'type': 'loss', 'content': 0.24363994598388672, 'timestamp': '2025-09-05 09:10:41.193535', 'step': 2813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:41.369512', 'step': 2813, 'epoch': 3} {'type': 'loss', 'content': 0.29551854729652405, 'timestamp': '2025-09-05 09:10:41.374421', 'step': 2814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:41.645614', 'step': 2814, 'epoch': 3} {'type': 'loss', 'content': 0.3614238202571869, 'timestamp': '2025-09-05 09:10:41.649105', 'step': 2815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:41.827736', 'step': 2815, 'epoch': 3} {'type': 'loss', 'content': 0.4348675310611725, 'timestamp': '2025-09-05 09:10:41.847192', 'step': 2816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:42.024160', 'step': 2816, 'epoch': 3} {'type': 'loss', 'content': 0.26274287700653076, 'timestamp': '2025-09-05 09:10:42.026226', 'step': 2817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:42.201657', 'step': 2817, 'epoch': 3} {'type': 'loss', 'content': 0.28489479422569275, 'timestamp': '2025-09-05 09:10:42.204164', 'step': 2818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:42.382200', 'step': 2818, 'epoch': 3} {'type': 'loss', 'content': 0.3202391564846039, 'timestamp': '2025-09-05 09:10:42.385910', 'step': 2819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:42.566485', 'step': 2819, 'epoch': 3} {'type': 'loss', 'content': 0.31756865978240967, 'timestamp': '2025-09-05 09:10:42.583875', 'step': 2820, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:10:48.047056', 'step': 2820, 'epoch': 3} {'type': 'pplx', 'content': 57.04747695124036, 'timestamp': '2025-09-05 09:10:48.051916', 'step': 2820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:48.232338', 'step': 2820, 'epoch': 3} {'type': 'loss', 'content': 0.2091667205095291, 'timestamp': '2025-09-05 09:10:48.235792', 'step': 2821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:48.446456', 'step': 2821, 'epoch': 3} {'type': 'loss', 'content': 0.39467981457710266, 'timestamp': '2025-09-05 09:10:48.449068', 'step': 2822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:48.647493', 'step': 2822, 'epoch': 3} {'type': 'loss', 'content': 0.2453453540802002, 'timestamp': '2025-09-05 09:10:48.651805', 'step': 2823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:48.851775', 'step': 2823, 'epoch': 3} {'type': 'loss', 'content': 0.3213421404361725, 'timestamp': '2025-09-05 09:10:48.870243', 'step': 2824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:49.067797', 'step': 2824, 'epoch': 3} {'type': 'loss', 'content': 0.360073447227478, 'timestamp': '2025-09-05 09:10:49.071550', 'step': 2825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:49.288504', 'step': 2825, 'epoch': 3} {'type': 'loss', 'content': 0.3350084125995636, 'timestamp': '2025-09-05 09:10:49.293974', 'step': 2826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:49.517532', 'step': 2826, 'epoch': 3} {'type': 'loss', 'content': 0.2850082218647003, 'timestamp': '2025-09-05 09:10:49.521509', 'step': 2827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:49.734675', 'step': 2827, 'epoch': 3} {'type': 'loss', 'content': 0.24017834663391113, 'timestamp': '2025-09-05 09:10:49.750677', 'step': 2828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:49.956360', 'step': 2828, 'epoch': 3} {'type': 'loss', 'content': 0.38482263684272766, 'timestamp': '2025-09-05 09:10:49.958550', 'step': 2829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:50.165235', 'step': 2829, 'epoch': 3} {'type': 'loss', 'content': 0.382106751203537, 'timestamp': '2025-09-05 09:10:50.169821', 'step': 2830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:50.376138', 'step': 2830, 'epoch': 3} {'type': 'loss', 'content': 0.2114604413509369, 'timestamp': '2025-09-05 09:10:50.378830', 'step': 2831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:50.589159', 'step': 2831, 'epoch': 3} {'type': 'loss', 'content': 0.2949742078781128, 'timestamp': '2025-09-05 09:10:50.605645', 'step': 2832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:50.844757', 'step': 2832, 'epoch': 3} {'type': 'loss', 'content': 0.19070105254650116, 'timestamp': '2025-09-05 09:10:50.847226', 'step': 2833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:51.046271', 'step': 2833, 'epoch': 3} {'type': 'loss', 'content': 0.23265816271305084, 'timestamp': '2025-09-05 09:10:51.049494', 'step': 2834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:51.214113', 'step': 2834, 'epoch': 3} {'type': 'loss', 'content': 0.3064287602901459, 'timestamp': '2025-09-05 09:10:51.221071', 'step': 2835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:51.431369', 'step': 2835, 'epoch': 3} {'type': 'loss', 'content': 0.33445465564727783, 'timestamp': '2025-09-05 09:10:51.446234', 'step': 2836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:51.644763', 'step': 2836, 'epoch': 3} {'type': 'loss', 'content': 0.2877272665500641, 'timestamp': '2025-09-05 09:10:51.646928', 'step': 2837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:10:51.846453', 'step': 2837, 'epoch': 3} {'type': 'loss', 'content': 0.2460276037454605, 'timestamp': '2025-09-05 09:10:51.851098', 'step': 2838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:52.053185', 'step': 2838, 'epoch': 3} {'type': 'loss', 'content': 0.47292837500572205, 'timestamp': '2025-09-05 09:10:52.056120', 'step': 2839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:52.253361', 'step': 2839, 'epoch': 3} {'type': 'loss', 'content': 0.23295848071575165, 'timestamp': '2025-09-05 09:10:52.272148', 'step': 2840, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:10:57.469162', 'step': 2840, 'epoch': 3} {'type': 'pplx', 'content': 56.21594373110914, 'timestamp': '2025-09-05 09:10:57.473155', 'step': 2840, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2840', 'timestamp': '2025-09-05 09:10:57.984109', 'step': 2840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:58.122877', 'step': 2840, 'epoch': 3} {'type': 'loss', 'content': 0.31002530455589294, 'timestamp': '2025-09-05 09:10:58.127195', 'step': 2841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:58.305422', 'step': 2841, 'epoch': 3} {'type': 'loss', 'content': 0.28681862354278564, 'timestamp': '2025-09-05 09:10:58.350144', 'step': 2842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:58.535226', 'step': 2842, 'epoch': 3} {'type': 'loss', 'content': 0.44048944115638733, 'timestamp': '2025-09-05 09:10:58.576570', 'step': 2843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:58.866502', 'step': 2843, 'epoch': 3} {'type': 'loss', 'content': 0.4286167025566101, 'timestamp': '2025-09-05 09:10:58.881406', 'step': 2844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:10:59.144763', 'step': 2844, 'epoch': 3} {'type': 'loss', 'content': 0.28463494777679443, 'timestamp': '2025-09-05 09:10:59.149799', 'step': 2845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:10:59.319538', 'step': 2845, 'epoch': 3} {'type': 'loss', 'content': 0.2371792048215866, 'timestamp': '2025-09-05 09:10:59.321865', 'step': 2846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:59.494662', 'step': 2846, 'epoch': 3} {'type': 'loss', 'content': 0.27211883664131165, 'timestamp': '2025-09-05 09:10:59.525047', 'step': 2847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:10:59.758416', 'step': 2847, 'epoch': 3} {'type': 'loss', 'content': 0.2545848488807678, 'timestamp': '2025-09-05 09:10:59.776991', 'step': 2848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:10:59.931962', 'step': 2848, 'epoch': 3} {'type': 'loss', 'content': 0.24429751932621002, 'timestamp': '2025-09-05 09:10:59.935000', 'step': 2849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:00.096442', 'step': 2849, 'epoch': 3} {'type': 'loss', 'content': 0.20609146356582642, 'timestamp': '2025-09-05 09:11:00.099578', 'step': 2850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:00.262502', 'step': 2850, 'epoch': 3} {'type': 'loss', 'content': 0.274873286485672, 'timestamp': '2025-09-05 09:11:00.306226', 'step': 2851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:00.478179', 'step': 2851, 'epoch': 3} {'type': 'loss', 'content': 0.24983830749988556, 'timestamp': '2025-09-05 09:11:00.501042', 'step': 2852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:00.666751', 'step': 2852, 'epoch': 3} {'type': 'loss', 'content': 0.2654348611831665, 'timestamp': '2025-09-05 09:11:00.670801', 'step': 2853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:00.854654', 'step': 2853, 'epoch': 3} {'type': 'loss', 'content': 0.2965334951877594, 'timestamp': '2025-09-05 09:11:00.857254', 'step': 2854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:01.111321', 'step': 2854, 'epoch': 3} {'type': 'loss', 'content': 0.5099512338638306, 'timestamp': '2025-09-05 09:11:01.114036', 'step': 2855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:11:01.289857', 'step': 2855, 'epoch': 3} {'type': 'loss', 'content': 0.27999117970466614, 'timestamp': '2025-09-05 09:11:01.305114', 'step': 2856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:01.460920', 'step': 2856, 'epoch': 3} {'type': 'loss', 'content': 0.4357217848300934, 'timestamp': '2025-09-05 09:11:01.463101', 'step': 2857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:01.637596', 'step': 2857, 'epoch': 3} {'type': 'loss', 'content': 0.304353266954422, 'timestamp': '2025-09-05 09:11:01.640459', 'step': 2858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:01.867325', 'step': 2858, 'epoch': 3} {'type': 'loss', 'content': 0.3254320025444031, 'timestamp': '2025-09-05 09:11:01.870492', 'step': 2859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:02.041130', 'step': 2859, 'epoch': 3} {'type': 'loss', 'content': 0.42343053221702576, 'timestamp': '2025-09-05 09:11:02.056588', 'step': 2860, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:11:07.149407', 'step': 2860, 'epoch': 3} {'type': 'pplx', 'content': 56.447537887231604, 'timestamp': '2025-09-05 09:11:07.153770', 'step': 2860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:07.324510', 'step': 2860, 'epoch': 3} {'type': 'loss', 'content': 0.2908163368701935, 'timestamp': '2025-09-05 09:11:07.329103', 'step': 2861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:07.535867', 'step': 2861, 'epoch': 3} {'type': 'loss', 'content': 0.272657185792923, 'timestamp': '2025-09-05 09:11:07.538006', 'step': 2862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:07.788214', 'step': 2862, 'epoch': 3} {'type': 'loss', 'content': 0.30033841729164124, 'timestamp': '2025-09-05 09:11:07.790608', 'step': 2863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:07.999472', 'step': 2863, 'epoch': 3} {'type': 'loss', 'content': 0.3589075207710266, 'timestamp': '2025-09-05 09:11:08.008266', 'step': 2864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:08.174062', 'step': 2864, 'epoch': 3} {'type': 'loss', 'content': 0.2896050810813904, 'timestamp': '2025-09-05 09:11:08.177026', 'step': 2865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:08.385757', 'step': 2865, 'epoch': 3} {'type': 'loss', 'content': 0.303393691778183, 'timestamp': '2025-09-05 09:11:08.390120', 'step': 2866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:08.567458', 'step': 2866, 'epoch': 3} {'type': 'loss', 'content': 0.2631215453147888, 'timestamp': '2025-09-05 09:11:08.570872', 'step': 2867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:08.732555', 'step': 2867, 'epoch': 3} {'type': 'loss', 'content': 0.17335376143455505, 'timestamp': '2025-09-05 09:11:08.749457', 'step': 2868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:08.917535', 'step': 2868, 'epoch': 3} {'type': 'loss', 'content': 0.3204255998134613, 'timestamp': '2025-09-05 09:11:08.920552', 'step': 2869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:09.083644', 'step': 2869, 'epoch': 3} {'type': 'loss', 'content': 0.2697032392024994, 'timestamp': '2025-09-05 09:11:09.117302', 'step': 2870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:09.304974', 'step': 2870, 'epoch': 3} {'type': 'loss', 'content': 0.2992748022079468, 'timestamp': '2025-09-05 09:11:09.307880', 'step': 2871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:09.489092', 'step': 2871, 'epoch': 3} {'type': 'loss', 'content': 0.4421190619468689, 'timestamp': '2025-09-05 09:11:09.503449', 'step': 2872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:09.655522', 'step': 2872, 'epoch': 3} {'type': 'loss', 'content': 0.3196788728237152, 'timestamp': '2025-09-05 09:11:09.657440', 'step': 2873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:09.845994', 'step': 2873, 'epoch': 3} {'type': 'loss', 'content': 0.35318523645401, 'timestamp': '2025-09-05 09:11:09.848550', 'step': 2874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:10.074877', 'step': 2874, 'epoch': 3} {'type': 'loss', 'content': 0.2514253556728363, 'timestamp': '2025-09-05 09:11:10.078009', 'step': 2875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:10.251551', 'step': 2875, 'epoch': 3} {'type': 'loss', 'content': 0.26804113388061523, 'timestamp': '2025-09-05 09:11:10.265438', 'step': 2876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:10.417155', 'step': 2876, 'epoch': 3} {'type': 'loss', 'content': 0.4025658071041107, 'timestamp': '2025-09-05 09:11:10.419938', 'step': 2877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:10.590848', 'step': 2877, 'epoch': 3} {'type': 'loss', 'content': 0.30375415086746216, 'timestamp': '2025-09-05 09:11:10.593530', 'step': 2878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:10.848347', 'step': 2878, 'epoch': 3} {'type': 'loss', 'content': 0.20703478157520294, 'timestamp': '2025-09-05 09:11:10.850887', 'step': 2879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:11.077416', 'step': 2879, 'epoch': 3} {'type': 'loss', 'content': 0.2653372287750244, 'timestamp': '2025-09-05 09:11:11.094236', 'step': 2880, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:11:16.273351', 'step': 2880, 'epoch': 3} {'type': 'pplx', 'content': 56.14329639107681, 'timestamp': '2025-09-05 09:11:16.275174', 'step': 2880, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2880', 'timestamp': '2025-09-05 09:11:16.727654', 'step': 2880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:16.902534', 'step': 2880, 'epoch': 3} {'type': 'loss', 'content': 0.18782301247119904, 'timestamp': '2025-09-05 09:11:16.904887', 'step': 2881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:17.108973', 'step': 2881, 'epoch': 3} {'type': 'loss', 'content': 0.46404707431793213, 'timestamp': '2025-09-05 09:11:17.111598', 'step': 2882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:17.330037', 'step': 2882, 'epoch': 3} {'type': 'loss', 'content': 0.23860648274421692, 'timestamp': '2025-09-05 09:11:17.332733', 'step': 2883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:17.579843', 'step': 2883, 'epoch': 3} {'type': 'loss', 'content': 0.3546093702316284, 'timestamp': '2025-09-05 09:11:17.594163', 'step': 2884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:17.783185', 'step': 2884, 'epoch': 3} {'type': 'loss', 'content': 0.2759442627429962, 'timestamp': '2025-09-05 09:11:17.785356', 'step': 2885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:11:17.981116', 'step': 2885, 'epoch': 3} {'type': 'loss', 'content': 0.28804221749305725, 'timestamp': '2025-09-05 09:11:17.983364', 'step': 2886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:18.178205', 'step': 2886, 'epoch': 3} {'type': 'loss', 'content': 0.2742845416069031, 'timestamp': '2025-09-05 09:11:18.180403', 'step': 2887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:18.385459', 'step': 2887, 'epoch': 3} {'type': 'loss', 'content': 0.2764730155467987, 'timestamp': '2025-09-05 09:11:18.394721', 'step': 2888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:18.557787', 'step': 2888, 'epoch': 3} {'type': 'loss', 'content': 0.29613184928894043, 'timestamp': '2025-09-05 09:11:18.559668', 'step': 2889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:18.763335', 'step': 2889, 'epoch': 3} {'type': 'loss', 'content': 0.3713921010494232, 'timestamp': '2025-09-05 09:11:18.765453', 'step': 2890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:18.969399', 'step': 2890, 'epoch': 3} {'type': 'loss', 'content': 0.3053815960884094, 'timestamp': '2025-09-05 09:11:18.971277', 'step': 2891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:19.176772', 'step': 2891, 'epoch': 3} {'type': 'loss', 'content': 0.2894931137561798, 'timestamp': '2025-09-05 09:11:19.190110', 'step': 2892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:11:19.381008', 'step': 2892, 'epoch': 3} {'type': 'loss', 'content': 0.449165016412735, 'timestamp': '2025-09-05 09:11:19.383226', 'step': 2893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:19.675218', 'step': 2893, 'epoch': 3} {'type': 'loss', 'content': 0.28056976199150085, 'timestamp': '2025-09-05 09:11:19.677221', 'step': 2894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:19.884855', 'step': 2894, 'epoch': 3} {'type': 'loss', 'content': 0.40280672907829285, 'timestamp': '2025-09-05 09:11:19.887529', 'step': 2895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:20.083466', 'step': 2895, 'epoch': 3} {'type': 'loss', 'content': 0.3817463219165802, 'timestamp': '2025-09-05 09:11:20.097033', 'step': 2896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:20.289971', 'step': 2896, 'epoch': 3} {'type': 'loss', 'content': 0.38125133514404297, 'timestamp': '2025-09-05 09:11:20.292065', 'step': 2897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:20.487917', 'step': 2897, 'epoch': 3} {'type': 'loss', 'content': 0.29178139567375183, 'timestamp': '2025-09-05 09:11:20.489961', 'step': 2898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:20.694429', 'step': 2898, 'epoch': 3} {'type': 'loss', 'content': 0.41615185141563416, 'timestamp': '2025-09-05 09:11:20.696952', 'step': 2899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:20.894823', 'step': 2899, 'epoch': 3} {'type': 'loss', 'content': 0.2605353593826294, 'timestamp': '2025-09-05 09:11:20.950986', 'step': 2900, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:11:27.057789', 'step': 2900, 'epoch': 3} {'type': 'pplx', 'content': 55.594667031207955, 'timestamp': '2025-09-05 09:11:27.061928', 'step': 2900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:11:27.195690', 'step': 2900, 'epoch': 3} {'type': 'loss', 'content': 0.3752339482307434, 'timestamp': '2025-09-05 09:11:27.198217', 'step': 2901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:27.340205', 'step': 2901, 'epoch': 3} {'type': 'loss', 'content': 0.2262498438358307, 'timestamp': '2025-09-05 09:11:27.342415', 'step': 2902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:27.478265', 'step': 2902, 'epoch': 3} {'type': 'loss', 'content': 0.2441215217113495, 'timestamp': '2025-09-05 09:11:27.480989', 'step': 2903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:27.661026', 'step': 2903, 'epoch': 3} {'type': 'loss', 'content': 0.32166409492492676, 'timestamp': '2025-09-05 09:11:27.670179', 'step': 2904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:27.804903', 'step': 2904, 'epoch': 3} {'type': 'loss', 'content': 0.3819389045238495, 'timestamp': '2025-09-05 09:11:27.806864', 'step': 2905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:11:27.941693', 'step': 2905, 'epoch': 3} {'type': 'loss', 'content': 0.3978331387042999, 'timestamp': '2025-09-05 09:11:27.944445', 'step': 2906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:28.082659', 'step': 2906, 'epoch': 3} {'type': 'loss', 'content': 0.16381220519542694, 'timestamp': '2025-09-05 09:11:28.085360', 'step': 2907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:28.311273', 'step': 2907, 'epoch': 3} {'type': 'loss', 'content': 0.29129958152770996, 'timestamp': '2025-09-05 09:11:28.327758', 'step': 2908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:28.495832', 'step': 2908, 'epoch': 3} {'type': 'loss', 'content': 0.24345554411411285, 'timestamp': '2025-09-05 09:11:28.498309', 'step': 2909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:28.659067', 'step': 2909, 'epoch': 3} {'type': 'loss', 'content': 0.3856002986431122, 'timestamp': '2025-09-05 09:11:28.661532', 'step': 2910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:29.055825', 'step': 2910, 'epoch': 3} {'type': 'loss', 'content': 0.32962942123413086, 'timestamp': '2025-09-05 09:11:29.058096', 'step': 2911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:11:29.295803', 'step': 2911, 'epoch': 3} {'type': 'loss', 'content': 0.4046390652656555, 'timestamp': '2025-09-05 09:11:29.311015', 'step': 2912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:29.465817', 'step': 2912, 'epoch': 3} {'type': 'loss', 'content': 0.26469311118125916, 'timestamp': '2025-09-05 09:11:29.467655', 'step': 2913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:29.635648', 'step': 2913, 'epoch': 3} {'type': 'loss', 'content': 0.2808852791786194, 'timestamp': '2025-09-05 09:11:29.637811', 'step': 2914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:29.810312', 'step': 2914, 'epoch': 3} {'type': 'loss', 'content': 0.3875598907470703, 'timestamp': '2025-09-05 09:11:29.812426', 'step': 2915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:29.974711', 'step': 2915, 'epoch': 3} {'type': 'loss', 'content': 0.23085609078407288, 'timestamp': '2025-09-05 09:11:29.991556', 'step': 2916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:30.257525', 'step': 2916, 'epoch': 3} {'type': 'loss', 'content': 0.24958936870098114, 'timestamp': '2025-09-05 09:11:30.300724', 'step': 2917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:30.744519', 'step': 2917, 'epoch': 3} {'type': 'loss', 'content': 0.1771358847618103, 'timestamp': '2025-09-05 09:11:30.774755', 'step': 2918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:31.008046', 'step': 2918, 'epoch': 3} {'type': 'loss', 'content': 0.22553907334804535, 'timestamp': '2025-09-05 09:11:31.010166', 'step': 2919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:31.185841', 'step': 2919, 'epoch': 3} {'type': 'loss', 'content': 0.24437619745731354, 'timestamp': '2025-09-05 09:11:31.203679', 'step': 2920, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:11:36.728102', 'step': 2920, 'epoch': 3} {'type': 'pplx', 'content': 54.64921801749989, 'timestamp': '2025-09-05 09:11:36.770957', 'step': 2920, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2920', 'timestamp': '2025-09-05 09:11:37.232785', 'step': 2920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:37.417932', 'step': 2920, 'epoch': 3} {'type': 'loss', 'content': 0.21169906854629517, 'timestamp': '2025-09-05 09:11:37.420206', 'step': 2921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:37.614738', 'step': 2921, 'epoch': 3} {'type': 'loss', 'content': 0.2853226363658905, 'timestamp': '2025-09-05 09:11:37.621981', 'step': 2922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:37.875781', 'step': 2922, 'epoch': 3} {'type': 'loss', 'content': 0.43432602286338806, 'timestamp': '2025-09-05 09:11:37.878382', 'step': 2923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:38.169912', 'step': 2923, 'epoch': 3} {'type': 'loss', 'content': 0.340687096118927, 'timestamp': '2025-09-05 09:11:38.184822', 'step': 2924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:38.404087', 'step': 2924, 'epoch': 3} {'type': 'loss', 'content': 0.23280911147594452, 'timestamp': '2025-09-05 09:11:38.423992', 'step': 2925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:38.717427', 'step': 2925, 'epoch': 3} {'type': 'loss', 'content': 0.3444267511367798, 'timestamp': '2025-09-05 09:11:38.725155', 'step': 2926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:11:39.030007', 'step': 2926, 'epoch': 3} {'type': 'loss', 'content': 0.49055567383766174, 'timestamp': '2025-09-05 09:11:39.032125', 'step': 2927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:39.236375', 'step': 2927, 'epoch': 3} {'type': 'loss', 'content': 0.15985403954982758, 'timestamp': '2025-09-05 09:11:39.249699', 'step': 2928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:39.494175', 'step': 2928, 'epoch': 3} {'type': 'loss', 'content': 0.3456610441207886, 'timestamp': '2025-09-05 09:11:39.537446', 'step': 2929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:39.746492', 'step': 2929, 'epoch': 3} {'type': 'loss', 'content': 0.3735094368457794, 'timestamp': '2025-09-05 09:11:39.748659', 'step': 2930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:39.951823', 'step': 2930, 'epoch': 3} {'type': 'loss', 'content': 0.24350151419639587, 'timestamp': '2025-09-05 09:11:39.954047', 'step': 2931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:40.120027', 'step': 2931, 'epoch': 3} {'type': 'loss', 'content': 0.16938263177871704, 'timestamp': '2025-09-05 09:11:40.142112', 'step': 2932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:40.415486', 'step': 2932, 'epoch': 3} {'type': 'loss', 'content': 0.2621157169342041, 'timestamp': '2025-09-05 09:11:40.417763', 'step': 2933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:40.629949', 'step': 2933, 'epoch': 3} {'type': 'loss', 'content': 0.29170507192611694, 'timestamp': '2025-09-05 09:11:40.632243', 'step': 2934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:40.840341', 'step': 2934, 'epoch': 3} {'type': 'loss', 'content': 0.20361952483654022, 'timestamp': '2025-09-05 09:11:40.842802', 'step': 2935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:41.091216', 'step': 2935, 'epoch': 3} {'type': 'loss', 'content': 0.2855290174484253, 'timestamp': '2025-09-05 09:11:41.105181', 'step': 2936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:41.293735', 'step': 2936, 'epoch': 3} {'type': 'loss', 'content': 0.2542724609375, 'timestamp': '2025-09-05 09:11:41.295789', 'step': 2937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:11:41.488491', 'step': 2937, 'epoch': 3} {'type': 'loss', 'content': 0.21698574721813202, 'timestamp': '2025-09-05 09:11:41.490288', 'step': 2938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:41.684731', 'step': 2938, 'epoch': 3} {'type': 'loss', 'content': 0.24226777255535126, 'timestamp': '2025-09-05 09:11:41.687400', 'step': 2939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:41.979428', 'step': 2939, 'epoch': 3} {'type': 'loss', 'content': 0.24583564698696136, 'timestamp': '2025-09-05 09:11:41.992509', 'step': 2940, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:11:47.784860', 'step': 2940, 'epoch': 3} {'type': 'pplx', 'content': 54.78391043335974, 'timestamp': '2025-09-05 09:11:47.787728', 'step': 2940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:48.031744', 'step': 2940, 'epoch': 3} {'type': 'loss', 'content': 0.2940543293952942, 'timestamp': '2025-09-05 09:11:48.050706', 'step': 2941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:48.259904', 'step': 2941, 'epoch': 3} {'type': 'loss', 'content': 0.39327362179756165, 'timestamp': '2025-09-05 09:11:48.262429', 'step': 2942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:48.460178', 'step': 2942, 'epoch': 3} {'type': 'loss', 'content': 0.38169828057289124, 'timestamp': '2025-09-05 09:11:48.462726', 'step': 2943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:48.669720', 'step': 2943, 'epoch': 3} {'type': 'loss', 'content': 0.3624424338340759, 'timestamp': '2025-09-05 09:11:48.724431', 'step': 2944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:48.968080', 'step': 2944, 'epoch': 3} {'type': 'loss', 'content': 0.1788206547498703, 'timestamp': '2025-09-05 09:11:49.012455', 'step': 2945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:49.352156', 'step': 2945, 'epoch': 3} {'type': 'loss', 'content': 0.32311442494392395, 'timestamp': '2025-09-05 09:11:49.354956', 'step': 2946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:49.559728', 'step': 2946, 'epoch': 3} {'type': 'loss', 'content': 0.15347428619861603, 'timestamp': '2025-09-05 09:11:49.562279', 'step': 2947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:49.767876', 'step': 2947, 'epoch': 3} {'type': 'loss', 'content': 0.3563332259654999, 'timestamp': '2025-09-05 09:11:49.825040', 'step': 2948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:50.052862', 'step': 2948, 'epoch': 3} {'type': 'loss', 'content': 0.13164092600345612, 'timestamp': '2025-09-05 09:11:50.055820', 'step': 2949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:50.263077', 'step': 2949, 'epoch': 3} {'type': 'loss', 'content': 0.41619938611984253, 'timestamp': '2025-09-05 09:11:50.265649', 'step': 2950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:50.462108', 'step': 2950, 'epoch': 3} {'type': 'loss', 'content': 0.31873831152915955, 'timestamp': '2025-09-05 09:11:50.500202', 'step': 2951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:50.748488', 'step': 2951, 'epoch': 3} {'type': 'loss', 'content': 0.38107752799987793, 'timestamp': '2025-09-05 09:11:50.762883', 'step': 2952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:50.951238', 'step': 2952, 'epoch': 3} {'type': 'loss', 'content': 0.2561323344707489, 'timestamp': '2025-09-05 09:11:50.953267', 'step': 2953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:51.157835', 'step': 2953, 'epoch': 3} {'type': 'loss', 'content': 0.42956051230430603, 'timestamp': '2025-09-05 09:11:51.200100', 'step': 2954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:51.449318', 'step': 2954, 'epoch': 3} {'type': 'loss', 'content': 0.328776091337204, 'timestamp': '2025-09-05 09:11:51.451728', 'step': 2955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:51.656524', 'step': 2955, 'epoch': 3} {'type': 'loss', 'content': 0.3320053815841675, 'timestamp': '2025-09-05 09:11:51.670659', 'step': 2956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:11:51.863783', 'step': 2956, 'epoch': 3} {'type': 'loss', 'content': 0.42864733934402466, 'timestamp': '2025-09-05 09:11:51.866444', 'step': 2957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:52.114254', 'step': 2957, 'epoch': 3} {'type': 'loss', 'content': 0.35642778873443604, 'timestamp': '2025-09-05 09:11:52.158212', 'step': 2958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:52.458736', 'step': 2958, 'epoch': 3} {'type': 'loss', 'content': 0.46565619111061096, 'timestamp': '2025-09-05 09:11:52.461733', 'step': 2959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:52.667328', 'step': 2959, 'epoch': 3} {'type': 'loss', 'content': 0.265375018119812, 'timestamp': '2025-09-05 09:11:52.682797', 'step': 2960, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:11:58.548505', 'step': 2960, 'epoch': 3} {'type': 'pplx', 'content': 55.55334624048589, 'timestamp': '2025-09-05 09:11:58.550509', 'step': 2960, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2960', 'timestamp': '2025-09-05 09:11:58.989021', 'step': 2960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:11:59.158088', 'step': 2960, 'epoch': 3} {'type': 'loss', 'content': 0.38203999400138855, 'timestamp': '2025-09-05 09:11:59.160162', 'step': 2961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:59.356216', 'step': 2961, 'epoch': 3} {'type': 'loss', 'content': 0.2547346353530884, 'timestamp': '2025-09-05 09:11:59.357933', 'step': 2962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:11:59.552815', 'step': 2962, 'epoch': 3} {'type': 'loss', 'content': 0.27306175231933594, 'timestamp': '2025-09-05 09:11:59.554906', 'step': 2963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:11:59.751367', 'step': 2963, 'epoch': 3} {'type': 'loss', 'content': 0.3331190347671509, 'timestamp': '2025-09-05 09:11:59.766971', 'step': 2964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:11:59.963517', 'step': 2964, 'epoch': 3} {'type': 'loss', 'content': 0.47358471155166626, 'timestamp': '2025-09-05 09:11:59.965670', 'step': 2965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:00.174056', 'step': 2965, 'epoch': 3} {'type': 'loss', 'content': 0.2792886197566986, 'timestamp': '2025-09-05 09:12:00.175714', 'step': 2966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:00.370898', 'step': 2966, 'epoch': 3} {'type': 'loss', 'content': 0.32101067900657654, 'timestamp': '2025-09-05 09:12:00.372845', 'step': 2967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:00.538749', 'step': 2967, 'epoch': 3} {'type': 'loss', 'content': 0.27621304988861084, 'timestamp': '2025-09-05 09:12:00.552344', 'step': 2968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:00.741327', 'step': 2968, 'epoch': 3} {'type': 'loss', 'content': 0.415333092212677, 'timestamp': '2025-09-05 09:12:00.743833', 'step': 2969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:00.950697', 'step': 2969, 'epoch': 3} {'type': 'loss', 'content': 0.26739320158958435, 'timestamp': '2025-09-05 09:12:00.952648', 'step': 2970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:12:01.159894', 'step': 2970, 'epoch': 3} {'type': 'loss', 'content': 0.24618351459503174, 'timestamp': '2025-09-05 09:12:01.162045', 'step': 2971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:01.324984', 'step': 2971, 'epoch': 3} {'type': 'loss', 'content': 0.20293676853179932, 'timestamp': '2025-09-05 09:12:01.342168', 'step': 2972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:01.538887', 'step': 2972, 'epoch': 3} {'type': 'loss', 'content': 0.36944884061813354, 'timestamp': '2025-09-05 09:12:01.540964', 'step': 2973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:01.738840', 'step': 2973, 'epoch': 3} {'type': 'loss', 'content': 0.28939300775527954, 'timestamp': '2025-09-05 09:12:01.743461', 'step': 2974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:01.953118', 'step': 2974, 'epoch': 3} {'type': 'loss', 'content': 0.37455692887306213, 'timestamp': '2025-09-05 09:12:01.955446', 'step': 2975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:02.165822', 'step': 2975, 'epoch': 3} {'type': 'loss', 'content': 0.3490968644618988, 'timestamp': '2025-09-05 09:12:02.179427', 'step': 2976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:02.375029', 'step': 2976, 'epoch': 3} {'type': 'loss', 'content': 0.375367134809494, 'timestamp': '2025-09-05 09:12:02.378208', 'step': 2977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:02.574703', 'step': 2977, 'epoch': 3} {'type': 'loss', 'content': 0.2835385799407959, 'timestamp': '2025-09-05 09:12:02.576904', 'step': 2978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:02.771920', 'step': 2978, 'epoch': 3} {'type': 'loss', 'content': 0.39227986335754395, 'timestamp': '2025-09-05 09:12:02.774121', 'step': 2979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:02.968704', 'step': 2979, 'epoch': 3} {'type': 'loss', 'content': 0.29060855507850647, 'timestamp': '2025-09-05 09:12:02.985594', 'step': 2980, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:12:07.636923', 'step': 2980, 'epoch': 3} {'type': 'pplx', 'content': 54.606959405023254, 'timestamp': '2025-09-05 09:12:07.638898', 'step': 2980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:07.798582', 'step': 2980, 'epoch': 3} {'type': 'loss', 'content': 0.20793351531028748, 'timestamp': '2025-09-05 09:12:07.800534', 'step': 2981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:07.965219', 'step': 2981, 'epoch': 3} {'type': 'loss', 'content': 0.2619383931159973, 'timestamp': '2025-09-05 09:12:07.967179', 'step': 2982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:08.171327', 'step': 2982, 'epoch': 3} {'type': 'loss', 'content': 0.27213922142982483, 'timestamp': '2025-09-05 09:12:08.173305', 'step': 2983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:08.376566', 'step': 2983, 'epoch': 3} {'type': 'loss', 'content': 0.2601911425590515, 'timestamp': '2025-09-05 09:12:08.385366', 'step': 2984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:08.547557', 'step': 2984, 'epoch': 3} {'type': 'loss', 'content': 0.2833835184574127, 'timestamp': '2025-09-05 09:12:08.549483', 'step': 2985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:08.754222', 'step': 2985, 'epoch': 3} {'type': 'loss', 'content': 0.40964579582214355, 'timestamp': '2025-09-05 09:12:08.756656', 'step': 2986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:08.961846', 'step': 2986, 'epoch': 3} {'type': 'loss', 'content': 0.41075843572616577, 'timestamp': '2025-09-05 09:12:08.964043', 'step': 2987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:09.168674', 'step': 2987, 'epoch': 3} {'type': 'loss', 'content': 0.43984243273735046, 'timestamp': '2025-09-05 09:12:09.182401', 'step': 2988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:09.368376', 'step': 2988, 'epoch': 3} {'type': 'loss', 'content': 0.40275534987449646, 'timestamp': '2025-09-05 09:12:09.370394', 'step': 2989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:09.574501', 'step': 2989, 'epoch': 3} {'type': 'loss', 'content': 0.32596346735954285, 'timestamp': '2025-09-05 09:12:09.576459', 'step': 2990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:09.741903', 'step': 2990, 'epoch': 3} {'type': 'loss', 'content': 0.3702400326728821, 'timestamp': '2025-09-05 09:12:09.744360', 'step': 2991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:09.934911', 'step': 2991, 'epoch': 3} {'type': 'loss', 'content': 0.33719366788864136, 'timestamp': '2025-09-05 09:12:09.943923', 'step': 2992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:10.102127', 'step': 2992, 'epoch': 3} {'type': 'loss', 'content': 0.2717023491859436, 'timestamp': '2025-09-05 09:12:10.104021', 'step': 2993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:10.308830', 'step': 2993, 'epoch': 3} {'type': 'loss', 'content': 0.40143582224845886, 'timestamp': '2025-09-05 09:12:10.311152', 'step': 2994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:12:10.503737', 'step': 2994, 'epoch': 3} {'type': 'loss', 'content': 0.2984316349029541, 'timestamp': '2025-09-05 09:12:10.510745', 'step': 2995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:10.706382', 'step': 2995, 'epoch': 3} {'type': 'loss', 'content': 0.14753331243991852, 'timestamp': '2025-09-05 09:12:10.715441', 'step': 2996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:10.876908', 'step': 2996, 'epoch': 3} {'type': 'loss', 'content': 0.2177143543958664, 'timestamp': '2025-09-05 09:12:10.879118', 'step': 2997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:11.082860', 'step': 2997, 'epoch': 3} {'type': 'loss', 'content': 0.39515912532806396, 'timestamp': '2025-09-05 09:12:11.085024', 'step': 2998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:11.252542', 'step': 2998, 'epoch': 3} {'type': 'loss', 'content': 0.3414614498615265, 'timestamp': '2025-09-05 09:12:11.254655', 'step': 2999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:11.448211', 'step': 2999, 'epoch': 3} {'type': 'loss', 'content': 0.34329718351364136, 'timestamp': '2025-09-05 09:12:11.462044', 'step': 3000, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:12:16.107356', 'step': 3000, 'epoch': 3} {'type': 'pplx', 'content': 53.10410466038564, 'timestamp': '2025-09-05 09:12:16.109410', 'step': 3000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3000', 'timestamp': '2025-09-05 09:12:16.567964', 'step': 3000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:16.746854', 'step': 3000, 'epoch': 3} {'type': 'loss', 'content': 0.5219593048095703, 'timestamp': '2025-09-05 09:12:16.748733', 'step': 3001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:16.914361', 'step': 3001, 'epoch': 3} {'type': 'loss', 'content': 0.22384533286094666, 'timestamp': '2025-09-05 09:12:16.916455', 'step': 3002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:17.118982', 'step': 3002, 'epoch': 3} {'type': 'loss', 'content': 0.35147175192832947, 'timestamp': '2025-09-05 09:12:17.121318', 'step': 3003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:17.324431', 'step': 3003, 'epoch': 3} {'type': 'loss', 'content': 0.3589687943458557, 'timestamp': '2025-09-05 09:12:17.340452', 'step': 3004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:17.535790', 'step': 3004, 'epoch': 3} {'type': 'loss', 'content': 0.27491340041160583, 'timestamp': '2025-09-05 09:12:17.538762', 'step': 3005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:12:17.732860', 'step': 3005, 'epoch': 3} {'type': 'loss', 'content': 0.2931725084781647, 'timestamp': '2025-09-05 09:12:17.735179', 'step': 3006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:17.937747', 'step': 3006, 'epoch': 3} {'type': 'loss', 'content': 0.299150675535202, 'timestamp': '2025-09-05 09:12:17.940089', 'step': 3007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:18.105046', 'step': 3007, 'epoch': 3} {'type': 'loss', 'content': 0.30979087948799133, 'timestamp': '2025-09-05 09:12:18.120936', 'step': 3008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:18.317017', 'step': 3008, 'epoch': 3} {'type': 'loss', 'content': 0.26512518525123596, 'timestamp': '2025-09-05 09:12:18.319197', 'step': 3009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:18.513289', 'step': 3009, 'epoch': 3} {'type': 'loss', 'content': 0.1896071434020996, 'timestamp': '2025-09-05 09:12:18.515493', 'step': 3010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:18.711265', 'step': 3010, 'epoch': 3} {'type': 'loss', 'content': 0.3693452477455139, 'timestamp': '2025-09-05 09:12:18.713314', 'step': 3011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:18.915519', 'step': 3011, 'epoch': 3} {'type': 'loss', 'content': 0.2476043701171875, 'timestamp': '2025-09-05 09:12:18.929455', 'step': 3012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:19.117546', 'step': 3012, 'epoch': 3} {'type': 'loss', 'content': 0.3103981912136078, 'timestamp': '2025-09-05 09:12:19.119840', 'step': 3013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:19.324252', 'step': 3013, 'epoch': 3} {'type': 'loss', 'content': 0.38308852910995483, 'timestamp': '2025-09-05 09:12:19.326376', 'step': 3014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:19.530283', 'step': 3014, 'epoch': 3} {'type': 'loss', 'content': 0.27662599086761475, 'timestamp': '2025-09-05 09:12:19.532388', 'step': 3015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:19.725363', 'step': 3015, 'epoch': 3} {'type': 'loss', 'content': 0.2989875376224518, 'timestamp': '2025-09-05 09:12:19.739697', 'step': 3016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:12:19.924782', 'step': 3016, 'epoch': 3} {'type': 'loss', 'content': 0.2708384394645691, 'timestamp': '2025-09-05 09:12:19.927781', 'step': 3017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:20.122071', 'step': 3017, 'epoch': 3} {'type': 'loss', 'content': 0.2433338612318039, 'timestamp': '2025-09-05 09:12:20.124302', 'step': 3018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:20.288372', 'step': 3018, 'epoch': 3} {'type': 'loss', 'content': 0.304372102022171, 'timestamp': '2025-09-05 09:12:20.291067', 'step': 3019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:20.495596', 'step': 3019, 'epoch': 3} {'type': 'loss', 'content': 0.27465543150901794, 'timestamp': '2025-09-05 09:12:20.509765', 'step': 3020, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:12:25.167359', 'step': 3020, 'epoch': 3} {'type': 'pplx', 'content': 52.50252237851087, 'timestamp': '2025-09-05 09:12:25.169469', 'step': 3020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:25.330830', 'step': 3020, 'epoch': 3} {'type': 'loss', 'content': 0.2500014305114746, 'timestamp': '2025-09-05 09:12:25.332814', 'step': 3021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:25.496063', 'step': 3021, 'epoch': 3} {'type': 'loss', 'content': 0.2104569673538208, 'timestamp': '2025-09-05 09:12:25.498366', 'step': 3022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:25.701436', 'step': 3022, 'epoch': 3} {'type': 'loss', 'content': 0.1959143728017807, 'timestamp': '2025-09-05 09:12:25.703511', 'step': 3023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:25.909136', 'step': 3023, 'epoch': 3} {'type': 'loss', 'content': 0.3365468978881836, 'timestamp': '2025-09-05 09:12:25.923098', 'step': 3024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:26.118433', 'step': 3024, 'epoch': 3} {'type': 'loss', 'content': 0.29884475469589233, 'timestamp': '2025-09-05 09:12:26.120908', 'step': 3025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:26.316247', 'step': 3025, 'epoch': 3} {'type': 'loss', 'content': 0.2602010667324066, 'timestamp': '2025-09-05 09:12:26.318653', 'step': 3026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:26.524920', 'step': 3026, 'epoch': 3} {'type': 'loss', 'content': 0.28113895654678345, 'timestamp': '2025-09-05 09:12:26.527074', 'step': 3027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:12:26.722996', 'step': 3027, 'epoch': 3} {'type': 'loss', 'content': 0.34172090888023376, 'timestamp': '2025-09-05 09:12:26.741293', 'step': 3028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:26.938961', 'step': 3028, 'epoch': 3} {'type': 'loss', 'content': 0.401425838470459, 'timestamp': '2025-09-05 09:12:26.940826', 'step': 3029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:27.146216', 'step': 3029, 'epoch': 3} {'type': 'loss', 'content': 0.2900570034980774, 'timestamp': '2025-09-05 09:12:27.148185', 'step': 3030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:27.343391', 'step': 3030, 'epoch': 3} {'type': 'loss', 'content': 0.2659049928188324, 'timestamp': '2025-09-05 09:12:27.345656', 'step': 3031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:27.550953', 'step': 3031, 'epoch': 3} {'type': 'loss', 'content': 0.30906596779823303, 'timestamp': '2025-09-05 09:12:27.567599', 'step': 3032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:27.761515', 'step': 3032, 'epoch': 3} {'type': 'loss', 'content': 0.4450221657752991, 'timestamp': '2025-09-05 09:12:27.763636', 'step': 3033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:27.959754', 'step': 3033, 'epoch': 3} {'type': 'loss', 'content': 0.35517576336860657, 'timestamp': '2025-09-05 09:12:27.962312', 'step': 3034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:28.167097', 'step': 3034, 'epoch': 3} {'type': 'loss', 'content': 0.4547875225543976, 'timestamp': '2025-09-05 09:12:28.169352', 'step': 3035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:28.364541', 'step': 3035, 'epoch': 3} {'type': 'loss', 'content': 0.2977902591228485, 'timestamp': '2025-09-05 09:12:28.381078', 'step': 3036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:28.574247', 'step': 3036, 'epoch': 3} {'type': 'loss', 'content': 0.2584066092967987, 'timestamp': '2025-09-05 09:12:28.576226', 'step': 3037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:28.778959', 'step': 3037, 'epoch': 3} {'type': 'loss', 'content': 0.22676676511764526, 'timestamp': '2025-09-05 09:12:28.781181', 'step': 3038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:28.974249', 'step': 3038, 'epoch': 3} {'type': 'loss', 'content': 0.20184023678302765, 'timestamp': '2025-09-05 09:12:28.976433', 'step': 3039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:29.179150', 'step': 3039, 'epoch': 3} {'type': 'loss', 'content': 0.3374442458152771, 'timestamp': '2025-09-05 09:12:29.195816', 'step': 3040, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:12:33.868272', 'step': 3040, 'epoch': 3} {'type': 'pplx', 'content': 52.40073254912315, 'timestamp': '2025-09-05 09:12:33.870120', 'step': 3040, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3040', 'timestamp': '2025-09-05 09:12:34.326846', 'step': 3040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:34.467443', 'step': 3040, 'epoch': 3} {'type': 'loss', 'content': 0.4248269498348236, 'timestamp': '2025-09-05 09:12:34.469714', 'step': 3041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:34.636452', 'step': 3041, 'epoch': 3} {'type': 'loss', 'content': 0.25288718938827515, 'timestamp': '2025-09-05 09:12:34.638663', 'step': 3042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:34.809846', 'step': 3042, 'epoch': 3} {'type': 'loss', 'content': 0.31233614683151245, 'timestamp': '2025-09-05 09:12:34.811951', 'step': 3043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:34.970532', 'step': 3043, 'epoch': 3} {'type': 'loss', 'content': 0.3236229419708252, 'timestamp': '2025-09-05 09:12:34.984081', 'step': 3044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:35.136091', 'step': 3044, 'epoch': 3} {'type': 'loss', 'content': 0.19996929168701172, 'timestamp': '2025-09-05 09:12:35.138576', 'step': 3045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:35.296364', 'step': 3045, 'epoch': 3} {'type': 'loss', 'content': 0.2629333436489105, 'timestamp': '2025-09-05 09:12:35.298689', 'step': 3046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:12:35.468744', 'step': 3046, 'epoch': 3} {'type': 'loss', 'content': 0.20324602723121643, 'timestamp': '2025-09-05 09:12:35.470496', 'step': 3047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:35.628681', 'step': 3047, 'epoch': 3} {'type': 'loss', 'content': 0.3019815981388092, 'timestamp': '2025-09-05 09:12:35.642804', 'step': 3048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:35.797526', 'step': 3048, 'epoch': 3} {'type': 'loss', 'content': 0.2801806628704071, 'timestamp': '2025-09-05 09:12:35.800581', 'step': 3049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:12:35.970991', 'step': 3049, 'epoch': 3} {'type': 'loss', 'content': 0.27385973930358887, 'timestamp': '2025-09-05 09:12:35.973122', 'step': 3050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:36.136878', 'step': 3050, 'epoch': 3} {'type': 'loss', 'content': 0.30080705881118774, 'timestamp': '2025-09-05 09:12:36.139304', 'step': 3051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:36.307841', 'step': 3051, 'epoch': 3} {'type': 'loss', 'content': 0.3155876398086548, 'timestamp': '2025-09-05 09:12:36.324192', 'step': 3052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:36.485707', 'step': 3052, 'epoch': 3} {'type': 'loss', 'content': 0.3727303445339203, 'timestamp': '2025-09-05 09:12:36.488132', 'step': 3053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:36.646680', 'step': 3053, 'epoch': 3} {'type': 'loss', 'content': 0.21063818037509918, 'timestamp': '2025-09-05 09:12:36.648771', 'step': 3054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:36.806161', 'step': 3054, 'epoch': 3} {'type': 'loss', 'content': 0.37034881114959717, 'timestamp': '2025-09-05 09:12:36.808221', 'step': 3055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:36.967297', 'step': 3055, 'epoch': 3} {'type': 'loss', 'content': 0.3373739719390869, 'timestamp': '2025-09-05 09:12:36.980504', 'step': 3056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:37.134015', 'step': 3056, 'epoch': 3} {'type': 'loss', 'content': 0.2569211721420288, 'timestamp': '2025-09-05 09:12:37.136192', 'step': 3057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:12:37.293082', 'step': 3057, 'epoch': 3} {'type': 'loss', 'content': 0.22303147614002228, 'timestamp': '2025-09-05 09:12:37.296720', 'step': 3058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:37.472309', 'step': 3058, 'epoch': 3} {'type': 'loss', 'content': 0.2923431396484375, 'timestamp': '2025-09-05 09:12:37.474200', 'step': 3059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:12:37.634249', 'step': 3059, 'epoch': 3} {'type': 'loss', 'content': 0.41064414381980896, 'timestamp': '2025-09-05 09:12:37.648170', 'step': 3060, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:12:42.310853', 'step': 3060, 'epoch': 3} {'type': 'pplx', 'content': 53.04668741895991, 'timestamp': '2025-09-05 09:12:42.313061', 'step': 3060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:42.445731', 'step': 3060, 'epoch': 3} {'type': 'loss', 'content': 0.2520500719547272, 'timestamp': '2025-09-05 09:12:42.448010', 'step': 3061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:42.584143', 'step': 3061, 'epoch': 3} {'type': 'loss', 'content': 0.33572205901145935, 'timestamp': '2025-09-05 09:12:42.588096', 'step': 3062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:42.762657', 'step': 3062, 'epoch': 3} {'type': 'loss', 'content': 0.2813510298728943, 'timestamp': '2025-09-05 09:12:42.764778', 'step': 3063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:42.925306', 'step': 3063, 'epoch': 3} {'type': 'loss', 'content': 0.23829667270183563, 'timestamp': '2025-09-05 09:12:42.941931', 'step': 3064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:43.104263', 'step': 3064, 'epoch': 3} {'type': 'loss', 'content': 0.2956577241420746, 'timestamp': '2025-09-05 09:12:43.106274', 'step': 3065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:43.266859', 'step': 3065, 'epoch': 3} {'type': 'loss', 'content': 0.24571780860424042, 'timestamp': '2025-09-05 09:12:43.269115', 'step': 3066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:43.426658', 'step': 3066, 'epoch': 3} {'type': 'loss', 'content': 0.17806021869182587, 'timestamp': '2025-09-05 09:12:43.429191', 'step': 3067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:43.600917', 'step': 3067, 'epoch': 3} {'type': 'loss', 'content': 0.2593998908996582, 'timestamp': '2025-09-05 09:12:43.609713', 'step': 3068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:43.742291', 'step': 3068, 'epoch': 3} {'type': 'loss', 'content': 0.34801289439201355, 'timestamp': '2025-09-05 09:12:43.744763', 'step': 3069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:43.912633', 'step': 3069, 'epoch': 3} {'type': 'loss', 'content': 0.5196738839149475, 'timestamp': '2025-09-05 09:12:43.914724', 'step': 3070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:44.073133', 'step': 3070, 'epoch': 3} {'type': 'loss', 'content': 0.3646704852581024, 'timestamp': '2025-09-05 09:12:44.075393', 'step': 3071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:44.248356', 'step': 3071, 'epoch': 3} {'type': 'loss', 'content': 0.3963429033756256, 'timestamp': '2025-09-05 09:12:44.265014', 'step': 3072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:44.429735', 'step': 3072, 'epoch': 3} {'type': 'loss', 'content': 0.265206903219223, 'timestamp': '2025-09-05 09:12:44.432399', 'step': 3073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:44.596582', 'step': 3073, 'epoch': 3} {'type': 'loss', 'content': 0.26692867279052734, 'timestamp': '2025-09-05 09:12:44.599011', 'step': 3074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:44.772730', 'step': 3074, 'epoch': 3} {'type': 'loss', 'content': 0.45934414863586426, 'timestamp': '2025-09-05 09:12:44.774557', 'step': 3075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:44.949906', 'step': 3075, 'epoch': 3} {'type': 'loss', 'content': 0.34467387199401855, 'timestamp': '2025-09-05 09:12:44.966102', 'step': 3076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:45.159510', 'step': 3076, 'epoch': 3} {'type': 'loss', 'content': 0.2720514237880707, 'timestamp': '2025-09-05 09:12:45.161795', 'step': 3077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:45.327367', 'step': 3077, 'epoch': 3} {'type': 'loss', 'content': 0.23640888929367065, 'timestamp': '2025-09-05 09:12:45.329368', 'step': 3078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:45.492634', 'step': 3078, 'epoch': 3} {'type': 'loss', 'content': 0.13110221922397614, 'timestamp': '2025-09-05 09:12:45.494825', 'step': 3079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:45.659733', 'step': 3079, 'epoch': 3} {'type': 'loss', 'content': 0.3650048077106476, 'timestamp': '2025-09-05 09:12:45.675789', 'step': 3080, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:12:50.314454', 'step': 3080, 'epoch': 3} {'type': 'pplx', 'content': 53.94136359098753, 'timestamp': '2025-09-05 09:12:50.316945', 'step': 3080, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3080', 'timestamp': '2025-09-05 09:12:50.796727', 'step': 3080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:50.935383', 'step': 3080, 'epoch': 3} {'type': 'loss', 'content': 0.2984461188316345, 'timestamp': '2025-09-05 09:12:50.937442', 'step': 3081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:51.107896', 'step': 3081, 'epoch': 3} {'type': 'loss', 'content': 0.3399762213230133, 'timestamp': '2025-09-05 09:12:51.109895', 'step': 3082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:51.274249', 'step': 3082, 'epoch': 3} {'type': 'loss', 'content': 0.19832485914230347, 'timestamp': '2025-09-05 09:12:51.276230', 'step': 3083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:51.440549', 'step': 3083, 'epoch': 3} {'type': 'loss', 'content': 0.3151915669441223, 'timestamp': '2025-09-05 09:12:51.454599', 'step': 3084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:51.617929', 'step': 3084, 'epoch': 3} {'type': 'loss', 'content': 0.27868831157684326, 'timestamp': '2025-09-05 09:12:51.620026', 'step': 3085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:51.786145', 'step': 3085, 'epoch': 3} {'type': 'loss', 'content': 0.3083263635635376, 'timestamp': '2025-09-05 09:12:51.788444', 'step': 3086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:51.962175', 'step': 3086, 'epoch': 3} {'type': 'loss', 'content': 0.23392392694950104, 'timestamp': '2025-09-05 09:12:51.964192', 'step': 3087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:52.128880', 'step': 3087, 'epoch': 3} {'type': 'loss', 'content': 0.36243969202041626, 'timestamp': '2025-09-05 09:12:52.142581', 'step': 3088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:12:52.301060', 'step': 3088, 'epoch': 3} {'type': 'loss', 'content': 0.36660972237586975, 'timestamp': '2025-09-05 09:12:52.303374', 'step': 3089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:52.467650', 'step': 3089, 'epoch': 3} {'type': 'loss', 'content': 0.1570238471031189, 'timestamp': '2025-09-05 09:12:52.469835', 'step': 3090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:52.643306', 'step': 3090, 'epoch': 3} {'type': 'loss', 'content': 0.22328078746795654, 'timestamp': '2025-09-05 09:12:52.645564', 'step': 3091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:52.810208', 'step': 3091, 'epoch': 3} {'type': 'loss', 'content': 0.34495118260383606, 'timestamp': '2025-09-05 09:12:52.827558', 'step': 3092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:52.985898', 'step': 3092, 'epoch': 3} {'type': 'loss', 'content': 0.3345975875854492, 'timestamp': '2025-09-05 09:12:52.988073', 'step': 3093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:53.151108', 'step': 3093, 'epoch': 3} {'type': 'loss', 'content': 0.31810086965560913, 'timestamp': '2025-09-05 09:12:53.154398', 'step': 3094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:53.325100', 'step': 3094, 'epoch': 3} {'type': 'loss', 'content': 0.3396746814250946, 'timestamp': '2025-09-05 09:12:53.327879', 'step': 3095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:53.492335', 'step': 3095, 'epoch': 3} {'type': 'loss', 'content': 0.319447785615921, 'timestamp': '2025-09-05 09:12:53.508844', 'step': 3096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:53.675150', 'step': 3096, 'epoch': 3} {'type': 'loss', 'content': 0.422185480594635, 'timestamp': '2025-09-05 09:12:53.677142', 'step': 3097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:12:53.845714', 'step': 3097, 'epoch': 3} {'type': 'loss', 'content': 0.24050793051719666, 'timestamp': '2025-09-05 09:12:53.848556', 'step': 3098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:54.014849', 'step': 3098, 'epoch': 3} {'type': 'loss', 'content': 0.3138499855995178, 'timestamp': '2025-09-05 09:12:54.017092', 'step': 3099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:12:54.180414', 'step': 3099, 'epoch': 3} {'type': 'loss', 'content': 0.20346899330615997, 'timestamp': '2025-09-05 09:12:54.194225', 'step': 3100, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:12:58.862549', 'step': 3100, 'epoch': 3} {'type': 'pplx', 'content': 54.427679569639615, 'timestamp': '2025-09-05 09:12:58.865548', 'step': 3100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:59.033815', 'step': 3100, 'epoch': 3} {'type': 'loss', 'content': 0.30215319991111755, 'timestamp': '2025-09-05 09:12:59.035886', 'step': 3101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:59.201630', 'step': 3101, 'epoch': 3} {'type': 'loss', 'content': 0.26254820823669434, 'timestamp': '2025-09-05 09:12:59.203802', 'step': 3102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:12:59.409539', 'step': 3102, 'epoch': 3} {'type': 'loss', 'content': 0.32716652750968933, 'timestamp': '2025-09-05 09:12:59.412056', 'step': 3103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:12:59.604460', 'step': 3103, 'epoch': 3} {'type': 'loss', 'content': 0.3003425896167755, 'timestamp': '2025-09-05 09:12:59.618822', 'step': 3104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:12:59.806993', 'step': 3104, 'epoch': 3} {'type': 'loss', 'content': 0.2939733564853668, 'timestamp': '2025-09-05 09:12:59.809008', 'step': 3105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:00.002760', 'step': 3105, 'epoch': 3} {'type': 'loss', 'content': 0.25660908222198486, 'timestamp': '2025-09-05 09:13:00.004380', 'step': 3106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:00.208171', 'step': 3106, 'epoch': 3} {'type': 'loss', 'content': 0.23837247490882874, 'timestamp': '2025-09-05 09:13:00.209793', 'step': 3107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:00.372238', 'step': 3107, 'epoch': 3} {'type': 'loss', 'content': 0.34475165605545044, 'timestamp': '2025-09-05 09:13:00.388481', 'step': 3108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:00.580255', 'step': 3108, 'epoch': 3} {'type': 'loss', 'content': 0.22767935693264008, 'timestamp': '2025-09-05 09:13:00.581944', 'step': 3109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:00.782539', 'step': 3109, 'epoch': 3} {'type': 'loss', 'content': 0.28474533557891846, 'timestamp': '2025-09-05 09:13:00.784441', 'step': 3110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:00.949594', 'step': 3110, 'epoch': 3} {'type': 'loss', 'content': 0.19329321384429932, 'timestamp': '2025-09-05 09:13:00.951484', 'step': 3111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:01.145321', 'step': 3111, 'epoch': 3} {'type': 'loss', 'content': 0.35781329870224, 'timestamp': '2025-09-05 09:13:01.158501', 'step': 3112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:01.345180', 'step': 3112, 'epoch': 3} {'type': 'loss', 'content': 0.3530445694923401, 'timestamp': '2025-09-05 09:13:01.346934', 'step': 3113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:01.540655', 'step': 3113, 'epoch': 3} {'type': 'loss', 'content': 0.40410447120666504, 'timestamp': '2025-09-05 09:13:01.542300', 'step': 3114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:01.736198', 'step': 3114, 'epoch': 3} {'type': 'loss', 'content': 0.4058263301849365, 'timestamp': '2025-09-05 09:13:01.737932', 'step': 3115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:01.930582', 'step': 3115, 'epoch': 3} {'type': 'loss', 'content': 0.27423328161239624, 'timestamp': '2025-09-05 09:13:01.944971', 'step': 3116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:02.141696', 'step': 3116, 'epoch': 3} {'type': 'loss', 'content': 0.2607077956199646, 'timestamp': '2025-09-05 09:13:02.143699', 'step': 3117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:02.340557', 'step': 3117, 'epoch': 3} {'type': 'loss', 'content': 0.42114734649658203, 'timestamp': '2025-09-05 09:13:02.342579', 'step': 3118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:02.546126', 'step': 3118, 'epoch': 3} {'type': 'loss', 'content': 0.3532010316848755, 'timestamp': '2025-09-05 09:13:02.548110', 'step': 3119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:02.742858', 'step': 3119, 'epoch': 3} {'type': 'loss', 'content': 0.23121574521064758, 'timestamp': '2025-09-05 09:13:02.758242', 'step': 3120, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:13:07.392319', 'step': 3120, 'epoch': 3} {'type': 'pplx', 'content': 55.011493179080475, 'timestamp': '2025-09-05 09:13:07.394549', 'step': 3120, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3120', 'timestamp': '2025-09-05 09:13:07.852680', 'step': 3120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:08.021571', 'step': 3120, 'epoch': 3} {'type': 'loss', 'content': 0.3971679210662842, 'timestamp': '2025-09-05 09:13:08.023605', 'step': 3121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:08.218220', 'step': 3121, 'epoch': 3} {'type': 'loss', 'content': 0.3160933256149292, 'timestamp': '2025-09-05 09:13:08.220278', 'step': 3122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:08.423966', 'step': 3122, 'epoch': 3} {'type': 'loss', 'content': 0.31142422556877136, 'timestamp': '2025-09-05 09:13:08.426571', 'step': 3123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:08.621738', 'step': 3123, 'epoch': 3} {'type': 'loss', 'content': 0.32644516229629517, 'timestamp': '2025-09-05 09:13:08.637658', 'step': 3124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:08.833274', 'step': 3124, 'epoch': 3} {'type': 'loss', 'content': 0.30763083696365356, 'timestamp': '2025-09-05 09:13:08.835053', 'step': 3125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:09.040750', 'step': 3125, 'epoch': 3} {'type': 'loss', 'content': 0.26925909519195557, 'timestamp': '2025-09-05 09:13:09.042723', 'step': 3126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:09.237489', 'step': 3126, 'epoch': 3} {'type': 'loss', 'content': 0.3116042912006378, 'timestamp': '2025-09-05 09:13:09.239615', 'step': 3127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:09.404887', 'step': 3127, 'epoch': 3} {'type': 'loss', 'content': 0.21474801003932953, 'timestamp': '2025-09-05 09:13:09.421229', 'step': 3128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:09.619863', 'step': 3128, 'epoch': 3} {'type': 'loss', 'content': 0.26150596141815186, 'timestamp': '2025-09-05 09:13:09.622785', 'step': 3129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:09.820954', 'step': 3129, 'epoch': 3} {'type': 'loss', 'content': 0.1961638182401657, 'timestamp': '2025-09-05 09:13:09.823856', 'step': 3130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:10.020164', 'step': 3130, 'epoch': 3} {'type': 'loss', 'content': 0.40828442573547363, 'timestamp': '2025-09-05 09:13:10.022062', 'step': 3131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:10.218948', 'step': 3131, 'epoch': 3} {'type': 'loss', 'content': 0.3171522915363312, 'timestamp': '2025-09-05 09:13:10.232774', 'step': 3132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:10.419755', 'step': 3132, 'epoch': 3} {'type': 'loss', 'content': 0.32482245564460754, 'timestamp': '2025-09-05 09:13:10.421934', 'step': 3133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:10.615055', 'step': 3133, 'epoch': 3} {'type': 'loss', 'content': 0.3538453280925751, 'timestamp': '2025-09-05 09:13:10.617063', 'step': 3134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:10.815295', 'step': 3134, 'epoch': 3} {'type': 'loss', 'content': 0.24779179692268372, 'timestamp': '2025-09-05 09:13:10.817327', 'step': 3135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:11.022062', 'step': 3135, 'epoch': 3} {'type': 'loss', 'content': 0.32560235261917114, 'timestamp': '2025-09-05 09:13:11.035980', 'step': 3136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:11.226163', 'step': 3136, 'epoch': 3} {'type': 'loss', 'content': 0.21415184438228607, 'timestamp': '2025-09-05 09:13:11.228008', 'step': 3137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:13:11.424295', 'step': 3137, 'epoch': 3} {'type': 'loss', 'content': 0.4218257665634155, 'timestamp': '2025-09-05 09:13:11.426517', 'step': 3138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:11.622235', 'step': 3138, 'epoch': 3} {'type': 'loss', 'content': 0.23505598306655884, 'timestamp': '2025-09-05 09:13:11.624260', 'step': 3139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:11.826337', 'step': 3139, 'epoch': 3} {'type': 'loss', 'content': 0.23702280223369598, 'timestamp': '2025-09-05 09:13:11.842648', 'step': 3140, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:13:16.493846', 'step': 3140, 'epoch': 3} {'type': 'pplx', 'content': 55.51095658927002, 'timestamp': '2025-09-05 09:13:16.496136', 'step': 3140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:16.640097', 'step': 3140, 'epoch': 3} {'type': 'loss', 'content': 0.3143397867679596, 'timestamp': '2025-09-05 09:13:16.642303', 'step': 3141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:16.807476', 'step': 3141, 'epoch': 3} {'type': 'loss', 'content': 0.3324390649795532, 'timestamp': '2025-09-05 09:13:16.809313', 'step': 3142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:16.974136', 'step': 3142, 'epoch': 3} {'type': 'loss', 'content': 0.33971095085144043, 'timestamp': '2025-09-05 09:13:16.975975', 'step': 3143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:17.141360', 'step': 3143, 'epoch': 3} {'type': 'loss', 'content': 0.19881880283355713, 'timestamp': '2025-09-05 09:13:17.159763', 'step': 3144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:17.353448', 'step': 3144, 'epoch': 3} {'type': 'loss', 'content': 0.38393595814704895, 'timestamp': '2025-09-05 09:13:17.356113', 'step': 3145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:17.534073', 'step': 3145, 'epoch': 3} {'type': 'loss', 'content': 0.2562781870365143, 'timestamp': '2025-09-05 09:13:17.535976', 'step': 3146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:17.721573', 'step': 3146, 'epoch': 3} {'type': 'loss', 'content': 0.29539769887924194, 'timestamp': '2025-09-05 09:13:17.724247', 'step': 3147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:13:17.893316', 'step': 3147, 'epoch': 3} {'type': 'loss', 'content': 0.3817562162876129, 'timestamp': '2025-09-05 09:13:17.907641', 'step': 3148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:18.072599', 'step': 3148, 'epoch': 3} {'type': 'loss', 'content': 0.3453420102596283, 'timestamp': '2025-09-05 09:13:18.074806', 'step': 3149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:18.210735', 'step': 3149, 'epoch': 3} {'type': 'loss', 'content': 0.4222257137298584, 'timestamp': '2025-09-05 09:13:18.212520', 'step': 3150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:18.347726', 'step': 3150, 'epoch': 3} {'type': 'loss', 'content': 0.3828117847442627, 'timestamp': '2025-09-05 09:13:18.349613', 'step': 3151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:18.522127', 'step': 3151, 'epoch': 3} {'type': 'loss', 'content': 0.17302794754505157, 'timestamp': '2025-09-05 09:13:18.531119', 'step': 3152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:18.663926', 'step': 3152, 'epoch': 3} {'type': 'loss', 'content': 0.18933707475662231, 'timestamp': '2025-09-05 09:13:18.665871', 'step': 3153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:13:18.799778', 'step': 3153, 'epoch': 3} {'type': 'loss', 'content': 0.15503232181072235, 'timestamp': '2025-09-05 09:13:18.801802', 'step': 3154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:18.982215', 'step': 3154, 'epoch': 3} {'type': 'loss', 'content': 0.23069196939468384, 'timestamp': '2025-09-05 09:13:18.984252', 'step': 3155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:19.148184', 'step': 3155, 'epoch': 3} {'type': 'loss', 'content': 0.2175375521183014, 'timestamp': '2025-09-05 09:13:19.162405', 'step': 3156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:19.327253', 'step': 3156, 'epoch': 3} {'type': 'loss', 'content': 0.34681564569473267, 'timestamp': '2025-09-05 09:13:19.329228', 'step': 3157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:19.465782', 'step': 3157, 'epoch': 3} {'type': 'loss', 'content': 0.21456918120384216, 'timestamp': '2025-09-05 09:13:19.468315', 'step': 3158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:13:19.632293', 'step': 3158, 'epoch': 3} {'type': 'loss', 'content': 0.2504569888114929, 'timestamp': '2025-09-05 09:13:19.634342', 'step': 3159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:19.797236', 'step': 3159, 'epoch': 3} {'type': 'loss', 'content': 0.2558198571205139, 'timestamp': '2025-09-05 09:13:19.811715', 'step': 3160, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:13:24.479387', 'step': 3160, 'epoch': 3} {'type': 'pplx', 'content': 55.59432534042321, 'timestamp': '2025-09-05 09:13:24.481354', 'step': 3160, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3160', 'timestamp': '2025-09-05 09:13:24.938644', 'step': 3160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:25.078249', 'step': 3160, 'epoch': 3} {'type': 'loss', 'content': 0.22591379284858704, 'timestamp': '2025-09-05 09:13:25.080397', 'step': 3161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:25.235416', 'step': 3161, 'epoch': 3} {'type': 'loss', 'content': 0.3258766233921051, 'timestamp': '2025-09-05 09:13:25.237445', 'step': 3162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:25.393153', 'step': 3162, 'epoch': 3} {'type': 'loss', 'content': 0.20219656825065613, 'timestamp': '2025-09-05 09:13:25.395273', 'step': 3163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:25.565963', 'step': 3163, 'epoch': 3} {'type': 'loss', 'content': 0.28959089517593384, 'timestamp': '2025-09-05 09:13:25.580379', 'step': 3164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:25.734032', 'step': 3164, 'epoch': 3} {'type': 'loss', 'content': 0.4348662495613098, 'timestamp': '2025-09-05 09:13:25.736065', 'step': 3165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:25.893585', 'step': 3165, 'epoch': 3} {'type': 'loss', 'content': 0.28287631273269653, 'timestamp': '2025-09-05 09:13:25.895614', 'step': 3166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:26.053273', 'step': 3166, 'epoch': 3} {'type': 'loss', 'content': 0.24401001632213593, 'timestamp': '2025-09-05 09:13:26.055349', 'step': 3167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:26.215424', 'step': 3167, 'epoch': 3} {'type': 'loss', 'content': 0.20977631211280823, 'timestamp': '2025-09-05 09:13:26.229460', 'step': 3168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:26.381101', 'step': 3168, 'epoch': 3} {'type': 'loss', 'content': 0.2639927268028259, 'timestamp': '2025-09-05 09:13:26.383275', 'step': 3169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:26.540967', 'step': 3169, 'epoch': 3} {'type': 'loss', 'content': 0.22925163805484772, 'timestamp': '2025-09-05 09:13:26.543319', 'step': 3170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:26.700237', 'step': 3170, 'epoch': 3} {'type': 'loss', 'content': 0.2756401300430298, 'timestamp': '2025-09-05 09:13:26.702363', 'step': 3171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:26.871120', 'step': 3171, 'epoch': 3} {'type': 'loss', 'content': 0.2692306935787201, 'timestamp': '2025-09-05 09:13:26.885314', 'step': 3172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:27.045316', 'step': 3172, 'epoch': 3} {'type': 'loss', 'content': 0.29711222648620605, 'timestamp': '2025-09-05 09:13:27.047747', 'step': 3173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:27.216850', 'step': 3173, 'epoch': 3} {'type': 'loss', 'content': 0.23208114504814148, 'timestamp': '2025-09-05 09:13:27.218956', 'step': 3174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:27.377079', 'step': 3174, 'epoch': 3} {'type': 'loss', 'content': 0.38599449396133423, 'timestamp': '2025-09-05 09:13:27.379006', 'step': 3175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:27.515279', 'step': 3175, 'epoch': 3} {'type': 'loss', 'content': 0.39971405267715454, 'timestamp': '2025-09-05 09:13:27.531379', 'step': 3176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:27.691399', 'step': 3176, 'epoch': 3} {'type': 'loss', 'content': 0.19964629411697388, 'timestamp': '2025-09-05 09:13:27.694855', 'step': 3177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:27.854835', 'step': 3177, 'epoch': 3} {'type': 'loss', 'content': 0.2849213778972626, 'timestamp': '2025-09-05 09:13:27.857053', 'step': 3178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:28.014115', 'step': 3178, 'epoch': 3} {'type': 'loss', 'content': 0.3031027317047119, 'timestamp': '2025-09-05 09:13:28.016142', 'step': 3179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:28.174919', 'step': 3179, 'epoch': 3} {'type': 'loss', 'content': 0.28292617201805115, 'timestamp': '2025-09-05 09:13:28.188940', 'step': 3180, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:13:32.818971', 'step': 3180, 'epoch': 3} {'type': 'pplx', 'content': 55.892830971609754, 'timestamp': '2025-09-05 09:13:32.820927', 'step': 3180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:32.988141', 'step': 3180, 'epoch': 3} {'type': 'loss', 'content': 0.3160610496997833, 'timestamp': '2025-09-05 09:13:32.989996', 'step': 3181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:33.192947', 'step': 3181, 'epoch': 3} {'type': 'loss', 'content': 0.39101895689964294, 'timestamp': '2025-09-05 09:13:33.194897', 'step': 3182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:33.391519', 'step': 3182, 'epoch': 3} {'type': 'loss', 'content': 0.31756341457366943, 'timestamp': '2025-09-05 09:13:33.393262', 'step': 3183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:33.598904', 'step': 3183, 'epoch': 3} {'type': 'loss', 'content': 0.38891616463661194, 'timestamp': '2025-09-05 09:13:33.612305', 'step': 3184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:33.808124', 'step': 3184, 'epoch': 3} {'type': 'loss', 'content': 0.23683956265449524, 'timestamp': '2025-09-05 09:13:33.810090', 'step': 3185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:34.005063', 'step': 3185, 'epoch': 3} {'type': 'loss', 'content': 0.2844909727573395, 'timestamp': '2025-09-05 09:13:34.007312', 'step': 3186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:34.202313', 'step': 3186, 'epoch': 3} {'type': 'loss', 'content': 0.3205225467681885, 'timestamp': '2025-09-05 09:13:34.204137', 'step': 3187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:34.406967', 'step': 3187, 'epoch': 3} {'type': 'loss', 'content': 0.2209499329328537, 'timestamp': '2025-09-05 09:13:34.422971', 'step': 3188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:34.619317', 'step': 3188, 'epoch': 3} {'type': 'loss', 'content': 0.2793022096157074, 'timestamp': '2025-09-05 09:13:34.621318', 'step': 3189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:34.817222', 'step': 3189, 'epoch': 3} {'type': 'loss', 'content': 0.2650418281555176, 'timestamp': '2025-09-05 09:13:34.819152', 'step': 3190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:35.014115', 'step': 3190, 'epoch': 3} {'type': 'loss', 'content': 0.3632805347442627, 'timestamp': '2025-09-05 09:13:35.016034', 'step': 3191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:35.210886', 'step': 3191, 'epoch': 3} {'type': 'loss', 'content': 0.1768227517604828, 'timestamp': '2025-09-05 09:13:35.224422', 'step': 3192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:35.412596', 'step': 3192, 'epoch': 3} {'type': 'loss', 'content': 0.3283153176307678, 'timestamp': '2025-09-05 09:13:35.414386', 'step': 3193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:35.618536', 'step': 3193, 'epoch': 3} {'type': 'loss', 'content': 0.2565184235572815, 'timestamp': '2025-09-05 09:13:35.620431', 'step': 3194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:35.816892', 'step': 3194, 'epoch': 3} {'type': 'loss', 'content': 0.21749939024448395, 'timestamp': '2025-09-05 09:13:35.819231', 'step': 3195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:36.022908', 'step': 3195, 'epoch': 3} {'type': 'loss', 'content': 0.20354753732681274, 'timestamp': '2025-09-05 09:13:36.037137', 'step': 3196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:36.225881', 'step': 3196, 'epoch': 3} {'type': 'loss', 'content': 0.3543854355812073, 'timestamp': '2025-09-05 09:13:36.227551', 'step': 3197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:36.422313', 'step': 3197, 'epoch': 3} {'type': 'loss', 'content': 0.3425697088241577, 'timestamp': '2025-09-05 09:13:36.424066', 'step': 3198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:36.629872', 'step': 3198, 'epoch': 3} {'type': 'loss', 'content': 0.21106812357902527, 'timestamp': '2025-09-05 09:13:36.632198', 'step': 3199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:36.837408', 'step': 3199, 'epoch': 3} {'type': 'loss', 'content': 0.23504656553268433, 'timestamp': '2025-09-05 09:13:36.853502', 'step': 3200, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:13:41.533859', 'step': 3200, 'epoch': 3} {'type': 'pplx', 'content': 55.82782568641178, 'timestamp': '2025-09-05 09:13:41.536162', 'step': 3200, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3200', 'timestamp': '2025-09-05 09:13:42.000609', 'step': 3200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:13:42.185163', 'step': 3200, 'epoch': 3} {'type': 'loss', 'content': 0.2856009304523468, 'timestamp': '2025-09-05 09:13:42.188048', 'step': 3201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:42.383871', 'step': 3201, 'epoch': 3} {'type': 'loss', 'content': 0.21880973875522614, 'timestamp': '2025-09-05 09:13:42.385968', 'step': 3202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:42.584596', 'step': 3202, 'epoch': 3} {'type': 'loss', 'content': 0.3296593725681305, 'timestamp': '2025-09-05 09:13:42.586488', 'step': 3203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:42.783648', 'step': 3203, 'epoch': 3} {'type': 'loss', 'content': 0.4413667321205139, 'timestamp': '2025-09-05 09:13:42.799481', 'step': 3204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:13:42.996163', 'step': 3204, 'epoch': 3} {'type': 'loss', 'content': 0.25287240743637085, 'timestamp': '2025-09-05 09:13:42.998732', 'step': 3205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:43.193562', 'step': 3205, 'epoch': 3} {'type': 'loss', 'content': 0.1394006609916687, 'timestamp': '2025-09-05 09:13:43.195855', 'step': 3206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:43.392977', 'step': 3206, 'epoch': 3} {'type': 'loss', 'content': 0.3348866403102875, 'timestamp': '2025-09-05 09:13:43.395309', 'step': 3207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:43.589743', 'step': 3207, 'epoch': 3} {'type': 'loss', 'content': 0.4713085889816284, 'timestamp': '2025-09-05 09:13:43.603613', 'step': 3208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:43.795151', 'step': 3208, 'epoch': 3} {'type': 'loss', 'content': 0.3775988817214966, 'timestamp': '2025-09-05 09:13:43.797101', 'step': 3209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:43.993547', 'step': 3209, 'epoch': 3} {'type': 'loss', 'content': 0.3147152066230774, 'timestamp': '2025-09-05 09:13:43.995458', 'step': 3210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:44.199575', 'step': 3210, 'epoch': 3} {'type': 'loss', 'content': 0.3415358364582062, 'timestamp': '2025-09-05 09:13:44.201472', 'step': 3211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:13:44.400141', 'step': 3211, 'epoch': 3} {'type': 'loss', 'content': 0.19705742597579956, 'timestamp': '2025-09-05 09:13:44.414020', 'step': 3212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:44.603265', 'step': 3212, 'epoch': 3} {'type': 'loss', 'content': 0.4473814368247986, 'timestamp': '2025-09-05 09:13:44.605283', 'step': 3213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:44.800855', 'step': 3213, 'epoch': 3} {'type': 'loss', 'content': 0.34939709305763245, 'timestamp': '2025-09-05 09:13:44.802749', 'step': 3214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:44.998852', 'step': 3214, 'epoch': 3} {'type': 'loss', 'content': 0.39002692699432373, 'timestamp': '2025-09-05 09:13:45.000884', 'step': 3215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:45.197315', 'step': 3215, 'epoch': 3} {'type': 'loss', 'content': 0.20467509329319, 'timestamp': '2025-09-05 09:13:45.213118', 'step': 3216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:45.407524', 'step': 3216, 'epoch': 3} {'type': 'loss', 'content': 0.3030194044113159, 'timestamp': '2025-09-05 09:13:45.409435', 'step': 3217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:45.604678', 'step': 3217, 'epoch': 3} {'type': 'loss', 'content': 0.27976304292678833, 'timestamp': '2025-09-05 09:13:45.606660', 'step': 3218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:45.801975', 'step': 3218, 'epoch': 3} {'type': 'loss', 'content': 0.19097599387168884, 'timestamp': '2025-09-05 09:13:45.804011', 'step': 3219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:45.999009', 'step': 3219, 'epoch': 3} {'type': 'loss', 'content': 0.314406156539917, 'timestamp': '2025-09-05 09:13:46.013392', 'step': 3220, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:13:50.632439', 'step': 3220, 'epoch': 3} {'type': 'pplx', 'content': 56.24416855105137, 'timestamp': '2025-09-05 09:13:50.634131', 'step': 3220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:50.794930', 'step': 3220, 'epoch': 3} {'type': 'loss', 'content': 0.303940087556839, 'timestamp': '2025-09-05 09:13:50.798304', 'step': 3221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:51.000115', 'step': 3221, 'epoch': 3} {'type': 'loss', 'content': 0.24126599729061127, 'timestamp': '2025-09-05 09:13:51.002152', 'step': 3222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:51.166682', 'step': 3222, 'epoch': 3} {'type': 'loss', 'content': 0.2524254322052002, 'timestamp': '2025-09-05 09:13:51.168699', 'step': 3223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:13:51.366062', 'step': 3223, 'epoch': 3} {'type': 'loss', 'content': 0.3770129382610321, 'timestamp': '2025-09-05 09:13:51.380480', 'step': 3224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:51.568788', 'step': 3224, 'epoch': 3} {'type': 'loss', 'content': 0.3300042450428009, 'timestamp': '2025-09-05 09:13:51.570467', 'step': 3225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:13:51.774578', 'step': 3225, 'epoch': 3} {'type': 'loss', 'content': 0.3154258728027344, 'timestamp': '2025-09-05 09:13:51.776918', 'step': 3226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:13:51.980358', 'step': 3226, 'epoch': 3} {'type': 'loss', 'content': 0.14694538712501526, 'timestamp': '2025-09-05 09:13:51.982512', 'step': 3227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:52.180855', 'step': 3227, 'epoch': 3} {'type': 'loss', 'content': 0.23194490373134613, 'timestamp': '2025-09-05 09:13:52.195702', 'step': 3228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:52.384044', 'step': 3228, 'epoch': 3} {'type': 'loss', 'content': 0.34397757053375244, 'timestamp': '2025-09-05 09:13:52.386127', 'step': 3229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:52.582752', 'step': 3229, 'epoch': 3} {'type': 'loss', 'content': 0.2911645472049713, 'timestamp': '2025-09-05 09:13:52.584730', 'step': 3230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:52.779959', 'step': 3230, 'epoch': 3} {'type': 'loss', 'content': 0.4919881224632263, 'timestamp': '2025-09-05 09:13:52.782040', 'step': 3231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:52.976859', 'step': 3231, 'epoch': 3} {'type': 'loss', 'content': 0.2355922907590866, 'timestamp': '2025-09-05 09:13:52.992677', 'step': 3232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:13:53.187410', 'step': 3232, 'epoch': 3} {'type': 'loss', 'content': 0.36438658833503723, 'timestamp': '2025-09-05 09:13:53.189659', 'step': 3233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:53.395388', 'step': 3233, 'epoch': 3} {'type': 'loss', 'content': 0.18304933607578278, 'timestamp': '2025-09-05 09:13:53.398335', 'step': 3234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:53.597527', 'step': 3234, 'epoch': 3} {'type': 'loss', 'content': 0.27435052394866943, 'timestamp': '2025-09-05 09:13:53.599413', 'step': 3235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:53.804343', 'step': 3235, 'epoch': 3} {'type': 'loss', 'content': 0.3234035074710846, 'timestamp': '2025-09-05 09:13:53.818643', 'step': 3236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:54.009349', 'step': 3236, 'epoch': 3} {'type': 'loss', 'content': 0.44214802980422974, 'timestamp': '2025-09-05 09:13:54.011547', 'step': 3237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:13:54.215298', 'step': 3237, 'epoch': 3} {'type': 'loss', 'content': 0.19979646801948547, 'timestamp': '2025-09-05 09:13:54.217753', 'step': 3238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:54.422847', 'step': 3238, 'epoch': 3} {'type': 'loss', 'content': 0.24443750083446503, 'timestamp': '2025-09-05 09:13:54.424945', 'step': 3239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:13:54.628788', 'step': 3239, 'epoch': 3} {'type': 'loss', 'content': 0.26840123534202576, 'timestamp': '2025-09-05 09:13:54.641842', 'step': 3240, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:13:59.277876', 'step': 3240, 'epoch': 3} {'type': 'pplx', 'content': 56.90161940514098, 'timestamp': '2025-09-05 09:13:59.280406', 'step': 3240, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3240', 'timestamp': '2025-09-05 09:13:59.748306', 'step': 3240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:13:59.918039', 'step': 3240, 'epoch': 3} {'type': 'loss', 'content': 0.4399571120738983, 'timestamp': '2025-09-05 09:13:59.920498', 'step': 3241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:00.124459', 'step': 3241, 'epoch': 3} {'type': 'loss', 'content': 0.26113492250442505, 'timestamp': '2025-09-05 09:14:00.126551', 'step': 3242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:00.322918', 'step': 3242, 'epoch': 3} {'type': 'loss', 'content': 0.255985289812088, 'timestamp': '2025-09-05 09:14:00.326082', 'step': 3243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:00.530352', 'step': 3243, 'epoch': 3} {'type': 'loss', 'content': 0.34716635942459106, 'timestamp': '2025-09-05 09:14:00.544313', 'step': 3244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:00.732575', 'step': 3244, 'epoch': 3} {'type': 'loss', 'content': 0.24577327072620392, 'timestamp': '2025-09-05 09:14:00.735048', 'step': 3245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:00.929746', 'step': 3245, 'epoch': 3} {'type': 'loss', 'content': 0.3293505907058716, 'timestamp': '2025-09-05 09:14:00.931782', 'step': 3246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:01.127220', 'step': 3246, 'epoch': 3} {'type': 'loss', 'content': 0.3006509840488434, 'timestamp': '2025-09-05 09:14:01.129383', 'step': 3247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:01.321083', 'step': 3247, 'epoch': 3} {'type': 'loss', 'content': 0.29109349846839905, 'timestamp': '2025-09-05 09:14:01.334922', 'step': 3248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:01.531797', 'step': 3248, 'epoch': 3} {'type': 'loss', 'content': 0.19694913923740387, 'timestamp': '2025-09-05 09:14:01.534299', 'step': 3249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:01.733757', 'step': 3249, 'epoch': 3} {'type': 'loss', 'content': 0.3834435045719147, 'timestamp': '2025-09-05 09:14:01.735505', 'step': 3250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:01.938130', 'step': 3250, 'epoch': 3} {'type': 'loss', 'content': 0.23048020899295807, 'timestamp': '2025-09-05 09:14:01.940204', 'step': 3251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:02.135458', 'step': 3251, 'epoch': 3} {'type': 'loss', 'content': 0.3646623492240906, 'timestamp': '2025-09-05 09:14:02.150277', 'step': 3252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:14:02.339019', 'step': 3252, 'epoch': 3} {'type': 'loss', 'content': 0.3779314160346985, 'timestamp': '2025-09-05 09:14:02.341680', 'step': 3253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:02.537969', 'step': 3253, 'epoch': 3} {'type': 'loss', 'content': 0.22319437563419342, 'timestamp': '2025-09-05 09:14:02.540092', 'step': 3254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:02.737611', 'step': 3254, 'epoch': 3} {'type': 'loss', 'content': 0.3502451479434967, 'timestamp': '2025-09-05 09:14:02.739738', 'step': 3255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:02.934475', 'step': 3255, 'epoch': 3} {'type': 'loss', 'content': 0.3203504681587219, 'timestamp': '2025-09-05 09:14:02.950514', 'step': 3256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:03.146960', 'step': 3256, 'epoch': 3} {'type': 'loss', 'content': 0.2472524791955948, 'timestamp': '2025-09-05 09:14:03.149188', 'step': 3257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:03.344244', 'step': 3257, 'epoch': 3} {'type': 'loss', 'content': 0.2531106770038605, 'timestamp': '2025-09-05 09:14:03.346032', 'step': 3258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:03.552245', 'step': 3258, 'epoch': 3} {'type': 'loss', 'content': 0.4314362108707428, 'timestamp': '2025-09-05 09:14:03.554754', 'step': 3259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:03.752049', 'step': 3259, 'epoch': 3} {'type': 'loss', 'content': 0.2161693572998047, 'timestamp': '2025-09-05 09:14:03.765613', 'step': 3260, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:14:08.486975', 'step': 3260, 'epoch': 3} {'type': 'pplx', 'content': 56.99661515636932, 'timestamp': '2025-09-05 09:14:08.493687', 'step': 3260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:08.628509', 'step': 3260, 'epoch': 3} {'type': 'loss', 'content': 0.23275181651115417, 'timestamp': '2025-09-05 09:14:08.630855', 'step': 3261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:08.793165', 'step': 3261, 'epoch': 3} {'type': 'loss', 'content': 0.4570353329181671, 'timestamp': '2025-09-05 09:14:08.797479', 'step': 3262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:08.967300', 'step': 3262, 'epoch': 3} {'type': 'loss', 'content': 0.2490944266319275, 'timestamp': '2025-09-05 09:14:08.976549', 'step': 3263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:09.140541', 'step': 3263, 'epoch': 3} {'type': 'loss', 'content': 0.28627151250839233, 'timestamp': '2025-09-05 09:14:09.160239', 'step': 3264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:14:09.335688', 'step': 3264, 'epoch': 3} {'type': 'loss', 'content': 0.3446066677570343, 'timestamp': '2025-09-05 09:14:09.338447', 'step': 3265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:09.519039', 'step': 3265, 'epoch': 3} {'type': 'loss', 'content': 0.2337915450334549, 'timestamp': '2025-09-05 09:14:09.528948', 'step': 3266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:09.752462', 'step': 3266, 'epoch': 3} {'type': 'loss', 'content': 0.2857224941253662, 'timestamp': '2025-09-05 09:14:09.756325', 'step': 3267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:09.992777', 'step': 3267, 'epoch': 3} {'type': 'loss', 'content': 0.306959867477417, 'timestamp': '2025-09-05 09:14:10.021273', 'step': 3268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:10.249649', 'step': 3268, 'epoch': 3} {'type': 'loss', 'content': 0.2844340205192566, 'timestamp': '2025-09-05 09:14:10.252493', 'step': 3269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:10.449235', 'step': 3269, 'epoch': 3} {'type': 'loss', 'content': 0.23272141814231873, 'timestamp': '2025-09-05 09:14:10.452990', 'step': 3270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:10.647682', 'step': 3270, 'epoch': 3} {'type': 'loss', 'content': 0.2352897822856903, 'timestamp': '2025-09-05 09:14:10.650540', 'step': 3271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:10.842725', 'step': 3271, 'epoch': 3} {'type': 'loss', 'content': 0.23539894819259644, 'timestamp': '2025-09-05 09:14:10.855848', 'step': 3272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:11.043914', 'step': 3272, 'epoch': 3} {'type': 'loss', 'content': 0.27581384778022766, 'timestamp': '2025-09-05 09:14:11.049234', 'step': 3273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:11.253580', 'step': 3273, 'epoch': 3} {'type': 'loss', 'content': 0.28692498803138733, 'timestamp': '2025-09-05 09:14:11.255575', 'step': 3274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:11.459964', 'step': 3274, 'epoch': 3} {'type': 'loss', 'content': 0.2923082411289215, 'timestamp': '2025-09-05 09:14:11.464675', 'step': 3275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:11.663448', 'step': 3275, 'epoch': 3} {'type': 'loss', 'content': 0.19943730533123016, 'timestamp': '2025-09-05 09:14:11.678177', 'step': 3276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:11.867747', 'step': 3276, 'epoch': 3} {'type': 'loss', 'content': 0.33723267912864685, 'timestamp': '2025-09-05 09:14:11.869804', 'step': 3277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:12.066298', 'step': 3277, 'epoch': 3} {'type': 'loss', 'content': 0.28404057025909424, 'timestamp': '2025-09-05 09:14:12.068253', 'step': 3278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:12.270783', 'step': 3278, 'epoch': 3} {'type': 'loss', 'content': 0.4263806939125061, 'timestamp': '2025-09-05 09:14:12.273001', 'step': 3279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:14:12.466169', 'step': 3279, 'epoch': 3} {'type': 'loss', 'content': 0.26628220081329346, 'timestamp': '2025-09-05 09:14:12.480160', 'step': 3280, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:14:17.120270', 'step': 3280, 'epoch': 3} {'type': 'pplx', 'content': 57.134940070215634, 'timestamp': '2025-09-05 09:14:17.122321', 'step': 3280, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3280', 'timestamp': '2025-09-05 09:14:17.606762', 'step': 3280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:17.775913', 'step': 3280, 'epoch': 3} {'type': 'loss', 'content': 0.25822189450263977, 'timestamp': '2025-09-05 09:14:17.778034', 'step': 3281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:17.972040', 'step': 3281, 'epoch': 3} {'type': 'loss', 'content': 0.3605766296386719, 'timestamp': '2025-09-05 09:14:17.974036', 'step': 3282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:18.168615', 'step': 3282, 'epoch': 3} {'type': 'loss', 'content': 0.16432644426822662, 'timestamp': '2025-09-05 09:14:18.170672', 'step': 3283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:18.337349', 'step': 3283, 'epoch': 3} {'type': 'loss', 'content': 0.36961498856544495, 'timestamp': '2025-09-05 09:14:18.353700', 'step': 3284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:14:18.550693', 'step': 3284, 'epoch': 3} {'type': 'loss', 'content': 0.314778596162796, 'timestamp': '2025-09-05 09:14:18.552728', 'step': 3285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:18.747988', 'step': 3285, 'epoch': 3} {'type': 'loss', 'content': 0.2686954736709595, 'timestamp': '2025-09-05 09:14:18.750449', 'step': 3286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:14:18.914285', 'step': 3286, 'epoch': 3} {'type': 'loss', 'content': 0.21251261234283447, 'timestamp': '2025-09-05 09:14:18.916301', 'step': 3287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:19.116973', 'step': 3287, 'epoch': 3} {'type': 'loss', 'content': 0.16383251547813416, 'timestamp': '2025-09-05 09:14:19.126361', 'step': 3288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:19.286046', 'step': 3288, 'epoch': 3} {'type': 'loss', 'content': 0.2723452150821686, 'timestamp': '2025-09-05 09:14:19.288450', 'step': 3289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:19.494176', 'step': 3289, 'epoch': 3} {'type': 'loss', 'content': 0.19908295571804047, 'timestamp': '2025-09-05 09:14:19.495960', 'step': 3290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:19.699764', 'step': 3290, 'epoch': 3} {'type': 'loss', 'content': 0.3640027940273285, 'timestamp': '2025-09-05 09:14:19.701833', 'step': 3291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:19.905047', 'step': 3291, 'epoch': 3} {'type': 'loss', 'content': 0.29716038703918457, 'timestamp': '2025-09-05 09:14:19.921246', 'step': 3292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:20.117897', 'step': 3292, 'epoch': 3} {'type': 'loss', 'content': 0.23596565425395966, 'timestamp': '2025-09-05 09:14:20.122259', 'step': 3293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:14:20.328057', 'step': 3293, 'epoch': 3} {'type': 'loss', 'content': 0.3065091371536255, 'timestamp': '2025-09-05 09:14:20.330125', 'step': 3294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:20.525257', 'step': 3294, 'epoch': 3} {'type': 'loss', 'content': 0.29849687218666077, 'timestamp': '2025-09-05 09:14:20.528075', 'step': 3295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:20.734713', 'step': 3295, 'epoch': 3} {'type': 'loss', 'content': 0.2795962393283844, 'timestamp': '2025-09-05 09:14:20.748329', 'step': 3296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:20.936480', 'step': 3296, 'epoch': 3} {'type': 'loss', 'content': 0.32312214374542236, 'timestamp': '2025-09-05 09:14:20.938447', 'step': 3297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:21.134729', 'step': 3297, 'epoch': 3} {'type': 'loss', 'content': 0.310803085565567, 'timestamp': '2025-09-05 09:14:21.136747', 'step': 3298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:21.332732', 'step': 3298, 'epoch': 3} {'type': 'loss', 'content': 0.26474887132644653, 'timestamp': '2025-09-05 09:14:21.335441', 'step': 3299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:21.501888', 'step': 3299, 'epoch': 3} {'type': 'loss', 'content': 0.25056520104408264, 'timestamp': '2025-09-05 09:14:21.518681', 'step': 3300, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:14:26.147488', 'step': 3300, 'epoch': 3} {'type': 'pplx', 'content': 57.88412892104688, 'timestamp': '2025-09-05 09:14:26.149332', 'step': 3300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:26.309641', 'step': 3300, 'epoch': 3} {'type': 'loss', 'content': 0.24012216925621033, 'timestamp': '2025-09-05 09:14:26.311702', 'step': 3301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:26.477497', 'step': 3301, 'epoch': 3} {'type': 'loss', 'content': 0.3001227378845215, 'timestamp': '2025-09-05 09:14:26.479473', 'step': 3302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:26.684596', 'step': 3302, 'epoch': 3} {'type': 'loss', 'content': 0.39674386382102966, 'timestamp': '2025-09-05 09:14:26.686953', 'step': 3303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:26.882026', 'step': 3303, 'epoch': 3} {'type': 'loss', 'content': 0.22072692215442657, 'timestamp': '2025-09-05 09:14:26.895795', 'step': 3304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:27.083043', 'step': 3304, 'epoch': 3} {'type': 'loss', 'content': 0.2948031723499298, 'timestamp': '2025-09-05 09:14:27.084945', 'step': 3305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:27.279441', 'step': 3305, 'epoch': 3} {'type': 'loss', 'content': 0.28666290640830994, 'timestamp': '2025-09-05 09:14:27.281505', 'step': 3306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:27.476574', 'step': 3306, 'epoch': 3} {'type': 'loss', 'content': 0.3318125903606415, 'timestamp': '2025-09-05 09:14:27.478516', 'step': 3307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:27.673538', 'step': 3307, 'epoch': 3} {'type': 'loss', 'content': 0.20074622333049774, 'timestamp': '2025-09-05 09:14:27.687577', 'step': 3308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:27.875149', 'step': 3308, 'epoch': 3} {'type': 'loss', 'content': 0.34110990166664124, 'timestamp': '2025-09-05 09:14:27.877311', 'step': 3309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:28.073214', 'step': 3309, 'epoch': 3} {'type': 'loss', 'content': 0.31203046441078186, 'timestamp': '2025-09-05 09:14:28.075308', 'step': 3310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:14:28.270514', 'step': 3310, 'epoch': 3} {'type': 'loss', 'content': 0.3481146991252899, 'timestamp': '2025-09-05 09:14:28.272785', 'step': 3311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:28.467367', 'step': 3311, 'epoch': 3} {'type': 'loss', 'content': 0.2804660201072693, 'timestamp': '2025-09-05 09:14:28.481098', 'step': 3312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:28.669353', 'step': 3312, 'epoch': 3} {'type': 'loss', 'content': 0.30905982851982117, 'timestamp': '2025-09-05 09:14:28.671337', 'step': 3313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:28.875523', 'step': 3313, 'epoch': 3} {'type': 'loss', 'content': 0.3836857080459595, 'timestamp': '2025-09-05 09:14:28.877566', 'step': 3314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:29.072635', 'step': 3314, 'epoch': 3} {'type': 'loss', 'content': 0.20405101776123047, 'timestamp': '2025-09-05 09:14:29.075571', 'step': 3315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:29.271910', 'step': 3315, 'epoch': 3} {'type': 'loss', 'content': 0.2559118866920471, 'timestamp': '2025-09-05 09:14:29.285986', 'step': 3316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:29.473237', 'step': 3316, 'epoch': 3} {'type': 'loss', 'content': 0.24376524984836578, 'timestamp': '2025-09-05 09:14:29.474965', 'step': 3317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:29.679559', 'step': 3317, 'epoch': 3} {'type': 'loss', 'content': 0.36361056566238403, 'timestamp': '2025-09-05 09:14:29.681590', 'step': 3318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:29.877224', 'step': 3318, 'epoch': 3} {'type': 'loss', 'content': 0.21259449422359467, 'timestamp': '2025-09-05 09:14:29.879560', 'step': 3319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:30.084812', 'step': 3319, 'epoch': 3} {'type': 'loss', 'content': 0.13015785813331604, 'timestamp': '2025-09-05 09:14:30.097888', 'step': 3320, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:14:34.729703', 'step': 3320, 'epoch': 3} {'type': 'pplx', 'content': 58.29189660131222, 'timestamp': '2025-09-05 09:14:34.731755', 'step': 3320, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3320', 'timestamp': '2025-09-05 09:14:35.184580', 'step': 3320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:35.352584', 'step': 3320, 'epoch': 3} {'type': 'loss', 'content': 0.24831537902355194, 'timestamp': '2025-09-05 09:14:35.354845', 'step': 3321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:35.521604', 'step': 3321, 'epoch': 3} {'type': 'loss', 'content': 0.3621161878108978, 'timestamp': '2025-09-05 09:14:35.523647', 'step': 3322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:35.719190', 'step': 3322, 'epoch': 3} {'type': 'loss', 'content': 0.39665305614471436, 'timestamp': '2025-09-05 09:14:35.720906', 'step': 3323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:35.914640', 'step': 3323, 'epoch': 3} {'type': 'loss', 'content': 0.2833714187145233, 'timestamp': '2025-09-05 09:14:35.927820', 'step': 3324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:36.115552', 'step': 3324, 'epoch': 3} {'type': 'loss', 'content': 0.24043862521648407, 'timestamp': '2025-09-05 09:14:36.117529', 'step': 3325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:36.322088', 'step': 3325, 'epoch': 3} {'type': 'loss', 'content': 0.30250465869903564, 'timestamp': '2025-09-05 09:14:36.324460', 'step': 3326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:36.521003', 'step': 3326, 'epoch': 3} {'type': 'loss', 'content': 0.2354232370853424, 'timestamp': '2025-09-05 09:14:36.523015', 'step': 3327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:36.716681', 'step': 3327, 'epoch': 3} {'type': 'loss', 'content': 0.2318793684244156, 'timestamp': '2025-09-05 09:14:36.730420', 'step': 3328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:36.918065', 'step': 3328, 'epoch': 3} {'type': 'loss', 'content': 0.40657877922058105, 'timestamp': '2025-09-05 09:14:36.920782', 'step': 3329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:37.116457', 'step': 3329, 'epoch': 3} {'type': 'loss', 'content': 0.33501723408699036, 'timestamp': '2025-09-05 09:14:37.122192', 'step': 3330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:37.288149', 'step': 3330, 'epoch': 3} {'type': 'loss', 'content': 0.3885265290737152, 'timestamp': '2025-09-05 09:14:37.290154', 'step': 3331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:37.484428', 'step': 3331, 'epoch': 3} {'type': 'loss', 'content': 0.2885796129703522, 'timestamp': '2025-09-05 09:14:37.498324', 'step': 3332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:37.693667', 'step': 3332, 'epoch': 3} {'type': 'loss', 'content': 0.20705479383468628, 'timestamp': '2025-09-05 09:14:37.695563', 'step': 3333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:37.891306', 'step': 3333, 'epoch': 3} {'type': 'loss', 'content': 0.35460859537124634, 'timestamp': '2025-09-05 09:14:37.893113', 'step': 3334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:14:38.056266', 'step': 3334, 'epoch': 3} {'type': 'loss', 'content': 0.2885071039199829, 'timestamp': '2025-09-05 09:14:38.058140', 'step': 3335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:38.262325', 'step': 3335, 'epoch': 3} {'type': 'loss', 'content': 0.31416335701942444, 'timestamp': '2025-09-05 09:14:38.276155', 'step': 3336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:38.465017', 'step': 3336, 'epoch': 3} {'type': 'loss', 'content': 0.3768198788166046, 'timestamp': '2025-09-05 09:14:38.467262', 'step': 3337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:38.662768', 'step': 3337, 'epoch': 3} {'type': 'loss', 'content': 0.20448677241802216, 'timestamp': '2025-09-05 09:14:38.665904', 'step': 3338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:38.867273', 'step': 3338, 'epoch': 3} {'type': 'loss', 'content': 0.18872617185115814, 'timestamp': '2025-09-05 09:14:38.869675', 'step': 3339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:39.064688', 'step': 3339, 'epoch': 3} {'type': 'loss', 'content': 0.2098010927438736, 'timestamp': '2025-09-05 09:14:39.077793', 'step': 3340, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:14:43.699963', 'step': 3340, 'epoch': 3} {'type': 'pplx', 'content': 57.79616724128643, 'timestamp': '2025-09-05 09:14:43.702238', 'step': 3340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:43.862535', 'step': 3340, 'epoch': 3} {'type': 'loss', 'content': 0.1867901235818863, 'timestamp': '2025-09-05 09:14:43.864350', 'step': 3341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:44.027936', 'step': 3341, 'epoch': 3} {'type': 'loss', 'content': 0.221108078956604, 'timestamp': '2025-09-05 09:14:44.030146', 'step': 3342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:44.231856', 'step': 3342, 'epoch': 3} {'type': 'loss', 'content': 0.34810617566108704, 'timestamp': '2025-09-05 09:14:44.234406', 'step': 3343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:44.431426', 'step': 3343, 'epoch': 3} {'type': 'loss', 'content': 0.2502887547016144, 'timestamp': '2025-09-05 09:14:44.445148', 'step': 3344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:14:44.633890', 'step': 3344, 'epoch': 3} {'type': 'loss', 'content': 0.29671764373779297, 'timestamp': '2025-09-05 09:14:44.635930', 'step': 3345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:44.827997', 'step': 3345, 'epoch': 3} {'type': 'loss', 'content': 0.37294748425483704, 'timestamp': '2025-09-05 09:14:44.831072', 'step': 3346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:45.025566', 'step': 3346, 'epoch': 3} {'type': 'loss', 'content': 0.1889505237340927, 'timestamp': '2025-09-05 09:14:45.028729', 'step': 3347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:45.221524', 'step': 3347, 'epoch': 3} {'type': 'loss', 'content': 0.3558424711227417, 'timestamp': '2025-09-05 09:14:45.236034', 'step': 3348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:45.418501', 'step': 3348, 'epoch': 3} {'type': 'loss', 'content': 0.334084689617157, 'timestamp': '2025-09-05 09:14:45.421506', 'step': 3349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:45.616166', 'step': 3349, 'epoch': 3} {'type': 'loss', 'content': 0.3016354739665985, 'timestamp': '2025-09-05 09:14:45.618253', 'step': 3350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:45.812162', 'step': 3350, 'epoch': 3} {'type': 'loss', 'content': 0.32526397705078125, 'timestamp': '2025-09-05 09:14:45.814521', 'step': 3351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:46.010569', 'step': 3351, 'epoch': 3} {'type': 'loss', 'content': 0.3452492952346802, 'timestamp': '2025-09-05 09:14:46.025210', 'step': 3352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:46.214549', 'step': 3352, 'epoch': 3} {'type': 'loss', 'content': 0.2737857401371002, 'timestamp': '2025-09-05 09:14:46.216639', 'step': 3353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:46.421245', 'step': 3353, 'epoch': 3} {'type': 'loss', 'content': 0.28672438859939575, 'timestamp': '2025-09-05 09:14:46.423862', 'step': 3354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:46.619848', 'step': 3354, 'epoch': 3} {'type': 'loss', 'content': 0.2686102092266083, 'timestamp': '2025-09-05 09:14:46.622311', 'step': 3355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:46.817721', 'step': 3355, 'epoch': 3} {'type': 'loss', 'content': 0.5274847745895386, 'timestamp': '2025-09-05 09:14:46.831979', 'step': 3356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:47.020822', 'step': 3356, 'epoch': 3} {'type': 'loss', 'content': 0.3943261206150055, 'timestamp': '2025-09-05 09:14:47.022852', 'step': 3357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:47.226539', 'step': 3357, 'epoch': 3} {'type': 'loss', 'content': 0.29574286937713623, 'timestamp': '2025-09-05 09:14:47.228426', 'step': 3358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:47.423390', 'step': 3358, 'epoch': 3} {'type': 'loss', 'content': 0.22139273583889008, 'timestamp': '2025-09-05 09:14:47.425194', 'step': 3359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:47.630502', 'step': 3359, 'epoch': 3} {'type': 'loss', 'content': 0.22906312346458435, 'timestamp': '2025-09-05 09:14:47.643775', 'step': 3360, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:14:52.285789', 'step': 3360, 'epoch': 3} {'type': 'pplx', 'content': 56.80259232058196, 'timestamp': '2025-09-05 09:14:52.287807', 'step': 3360, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3360', 'timestamp': '2025-09-05 09:14:52.754322', 'step': 3360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:52.922250', 'step': 3360, 'epoch': 3} {'type': 'loss', 'content': 0.2129780501127243, 'timestamp': '2025-09-05 09:14:52.924346', 'step': 3361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:53.126998', 'step': 3361, 'epoch': 3} {'type': 'loss', 'content': 0.2788529396057129, 'timestamp': '2025-09-05 09:14:53.129039', 'step': 3362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:53.323035', 'step': 3362, 'epoch': 3} {'type': 'loss', 'content': 0.1642555296421051, 'timestamp': '2025-09-05 09:14:53.325772', 'step': 3363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:53.529352', 'step': 3363, 'epoch': 3} {'type': 'loss', 'content': 0.2505939304828644, 'timestamp': '2025-09-05 09:14:53.544030', 'step': 3364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:53.733373', 'step': 3364, 'epoch': 3} {'type': 'loss', 'content': 0.3860357403755188, 'timestamp': '2025-09-05 09:14:53.735306', 'step': 3365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:53.902689', 'step': 3365, 'epoch': 3} {'type': 'loss', 'content': 0.2575131058692932, 'timestamp': '2025-09-05 09:14:53.904636', 'step': 3366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:54.111034', 'step': 3366, 'epoch': 3} {'type': 'loss', 'content': 0.2468288540840149, 'timestamp': '2025-09-05 09:14:54.113207', 'step': 3367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:54.280129', 'step': 3367, 'epoch': 3} {'type': 'loss', 'content': 0.21068817377090454, 'timestamp': '2025-09-05 09:14:54.296230', 'step': 3368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:54.490215', 'step': 3368, 'epoch': 3} {'type': 'loss', 'content': 0.3015161454677582, 'timestamp': '2025-09-05 09:14:54.492249', 'step': 3369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:54.695993', 'step': 3369, 'epoch': 3} {'type': 'loss', 'content': 0.32386377453804016, 'timestamp': '2025-09-05 09:14:54.698406', 'step': 3370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:14:54.905151', 'step': 3370, 'epoch': 3} {'type': 'loss', 'content': 0.3242306709289551, 'timestamp': '2025-09-05 09:14:54.910037', 'step': 3371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:55.127944', 'step': 3371, 'epoch': 3} {'type': 'loss', 'content': 0.3697621524333954, 'timestamp': '2025-09-05 09:14:55.142607', 'step': 3372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:14:55.330924', 'step': 3372, 'epoch': 3} {'type': 'loss', 'content': 0.2763964831829071, 'timestamp': '2025-09-05 09:14:55.332840', 'step': 3373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:55.536582', 'step': 3373, 'epoch': 3} {'type': 'loss', 'content': 0.237064391374588, 'timestamp': '2025-09-05 09:14:55.538791', 'step': 3374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:55.744391', 'step': 3374, 'epoch': 3} {'type': 'loss', 'content': 0.3390370309352875, 'timestamp': '2025-09-05 09:14:55.746411', 'step': 3375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:55.941059', 'step': 3375, 'epoch': 3} {'type': 'loss', 'content': 0.3362955152988434, 'timestamp': '2025-09-05 09:14:55.954866', 'step': 3376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:14:56.144253', 'step': 3376, 'epoch': 3} {'type': 'loss', 'content': 0.3102271854877472, 'timestamp': '2025-09-05 09:14:56.146687', 'step': 3377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:14:56.349031', 'step': 3377, 'epoch': 3} {'type': 'loss', 'content': 0.2332654595375061, 'timestamp': '2025-09-05 09:14:56.351094', 'step': 3378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:14:56.546133', 'step': 3378, 'epoch': 3} {'type': 'loss', 'content': 0.297242134809494, 'timestamp': '2025-09-05 09:14:56.548284', 'step': 3379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:14:56.743311', 'step': 3379, 'epoch': 3} {'type': 'loss', 'content': 0.3286650478839874, 'timestamp': '2025-09-05 09:14:56.756242', 'step': 3380, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:15:01.458280', 'step': 3380, 'epoch': 3} {'type': 'pplx', 'content': 57.62159304092415, 'timestamp': '2025-09-05 09:15:01.460429', 'step': 3380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:01.620078', 'step': 3380, 'epoch': 3} {'type': 'loss', 'content': 0.35747459530830383, 'timestamp': '2025-09-05 09:15:01.622312', 'step': 3381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:01.788171', 'step': 3381, 'epoch': 3} {'type': 'loss', 'content': 0.21469105780124664, 'timestamp': '2025-09-05 09:15:01.789982', 'step': 3382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:01.964218', 'step': 3382, 'epoch': 3} {'type': 'loss', 'content': 0.23298673331737518, 'timestamp': '2025-09-05 09:15:01.966641', 'step': 3383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:02.175542', 'step': 3383, 'epoch': 3} {'type': 'loss', 'content': 0.20766644179821014, 'timestamp': '2025-09-05 09:15:02.191896', 'step': 3384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:02.389069', 'step': 3384, 'epoch': 3} {'type': 'loss', 'content': 0.3102535009384155, 'timestamp': '2025-09-05 09:15:02.391536', 'step': 3385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:02.588812', 'step': 3385, 'epoch': 3} {'type': 'loss', 'content': 0.23322685062885284, 'timestamp': '2025-09-05 09:15:02.591193', 'step': 3386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:02.787088', 'step': 3386, 'epoch': 3} {'type': 'loss', 'content': 0.2468855381011963, 'timestamp': '2025-09-05 09:15:02.789001', 'step': 3387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:02.984473', 'step': 3387, 'epoch': 3} {'type': 'loss', 'content': 0.34562644362449646, 'timestamp': '2025-09-05 09:15:02.998435', 'step': 3388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:03.188465', 'step': 3388, 'epoch': 3} {'type': 'loss', 'content': 0.27134451270103455, 'timestamp': '2025-09-05 09:15:03.190918', 'step': 3389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:03.395100', 'step': 3389, 'epoch': 3} {'type': 'loss', 'content': 0.28415605425834656, 'timestamp': '2025-09-05 09:15:03.397109', 'step': 3390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:03.561477', 'step': 3390, 'epoch': 3} {'type': 'loss', 'content': 0.24175278842449188, 'timestamp': '2025-09-05 09:15:03.563253', 'step': 3391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:03.768969', 'step': 3391, 'epoch': 3} {'type': 'loss', 'content': 0.2102387249469757, 'timestamp': '2025-09-05 09:15:03.786745', 'step': 3392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:03.983685', 'step': 3392, 'epoch': 3} {'type': 'loss', 'content': 0.32016080617904663, 'timestamp': '2025-09-05 09:15:03.986311', 'step': 3393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:04.181398', 'step': 3393, 'epoch': 3} {'type': 'loss', 'content': 0.20062977075576782, 'timestamp': '2025-09-05 09:15:04.183780', 'step': 3394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:04.387838', 'step': 3394, 'epoch': 3} {'type': 'loss', 'content': 0.3682517409324646, 'timestamp': '2025-09-05 09:15:04.389851', 'step': 3395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:04.553368', 'step': 3395, 'epoch': 3} {'type': 'loss', 'content': 0.341085284948349, 'timestamp': '2025-09-05 09:15:04.569588', 'step': 3396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:04.765544', 'step': 3396, 'epoch': 3} {'type': 'loss', 'content': 0.32158076763153076, 'timestamp': '2025-09-05 09:15:04.767464', 'step': 3397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:04.932611', 'step': 3397, 'epoch': 3} {'type': 'loss', 'content': 0.25980469584465027, 'timestamp': '2025-09-05 09:15:04.934471', 'step': 3398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:05.139083', 'step': 3398, 'epoch': 3} {'type': 'loss', 'content': 0.2603665292263031, 'timestamp': '2025-09-05 09:15:05.140946', 'step': 3399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:05.305222', 'step': 3399, 'epoch': 3} {'type': 'loss', 'content': 0.3023158311843872, 'timestamp': '2025-09-05 09:15:05.324842', 'step': 3400, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:15:10.151502', 'step': 3400, 'epoch': 3} {'type': 'pplx', 'content': 57.15140966805781, 'timestamp': '2025-09-05 09:15:10.153894', 'step': 3400, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3400', 'timestamp': '2025-09-05 09:15:10.623749', 'step': 3400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:10.790882', 'step': 3400, 'epoch': 3} {'type': 'loss', 'content': 0.2876795530319214, 'timestamp': '2025-09-05 09:15:10.793809', 'step': 3401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:10.959766', 'step': 3401, 'epoch': 3} {'type': 'loss', 'content': 0.20607417821884155, 'timestamp': '2025-09-05 09:15:10.962061', 'step': 3402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:11.168326', 'step': 3402, 'epoch': 3} {'type': 'loss', 'content': 0.2529613673686981, 'timestamp': '2025-09-05 09:15:11.170402', 'step': 3403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:11.366376', 'step': 3403, 'epoch': 3} {'type': 'loss', 'content': 0.2774178385734558, 'timestamp': '2025-09-05 09:15:11.380522', 'step': 3404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:11.570006', 'step': 3404, 'epoch': 3} {'type': 'loss', 'content': 0.23143890500068665, 'timestamp': '2025-09-05 09:15:11.572036', 'step': 3405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:11.767214', 'step': 3405, 'epoch': 3} {'type': 'loss', 'content': 0.2894435226917267, 'timestamp': '2025-09-05 09:15:11.769237', 'step': 3406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:11.934854', 'step': 3406, 'epoch': 3} {'type': 'loss', 'content': 0.28832849860191345, 'timestamp': '2025-09-05 09:15:11.937495', 'step': 3407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:12.141320', 'step': 3407, 'epoch': 3} {'type': 'loss', 'content': 0.3540599048137665, 'timestamp': '2025-09-05 09:15:12.157930', 'step': 3408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:12.356207', 'step': 3408, 'epoch': 3} {'type': 'loss', 'content': 0.4962592124938965, 'timestamp': '2025-09-05 09:15:12.359073', 'step': 3409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:12.562077', 'step': 3409, 'epoch': 3} {'type': 'loss', 'content': 0.3610974848270416, 'timestamp': '2025-09-05 09:15:12.564088', 'step': 3410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:12.760083', 'step': 3410, 'epoch': 3} {'type': 'loss', 'content': 0.238719180226326, 'timestamp': '2025-09-05 09:15:12.762436', 'step': 3411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:12.958223', 'step': 3411, 'epoch': 3} {'type': 'loss', 'content': 0.3288671374320984, 'timestamp': '2025-09-05 09:15:12.972265', 'step': 3412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:13.169674', 'step': 3412, 'epoch': 3} {'type': 'loss', 'content': 0.32551729679107666, 'timestamp': '2025-09-05 09:15:13.172045', 'step': 3413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:13.378475', 'step': 3413, 'epoch': 3} {'type': 'loss', 'content': 0.24862384796142578, 'timestamp': '2025-09-05 09:15:13.380797', 'step': 3414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:13.576666', 'step': 3414, 'epoch': 3} {'type': 'loss', 'content': 0.28981438279151917, 'timestamp': '2025-09-05 09:15:13.578607', 'step': 3415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:13.783393', 'step': 3415, 'epoch': 3} {'type': 'loss', 'content': 0.2465643584728241, 'timestamp': '2025-09-05 09:15:13.797643', 'step': 3416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:13.990010', 'step': 3416, 'epoch': 3} {'type': 'loss', 'content': 0.36424243450164795, 'timestamp': '2025-09-05 09:15:13.992100', 'step': 3417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:14.188052', 'step': 3417, 'epoch': 3} {'type': 'loss', 'content': 0.2606428861618042, 'timestamp': '2025-09-05 09:15:14.190080', 'step': 3418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:14.355441', 'step': 3418, 'epoch': 3} {'type': 'loss', 'content': 0.335843950510025, 'timestamp': '2025-09-05 09:15:14.357952', 'step': 3419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:14.563346', 'step': 3419, 'epoch': 3} {'type': 'loss', 'content': 0.3763747215270996, 'timestamp': '2025-09-05 09:15:14.577993', 'step': 3420, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:15:19.236496', 'step': 3420, 'epoch': 3} {'type': 'pplx', 'content': 56.271628643405734, 'timestamp': '2025-09-05 09:15:19.238464', 'step': 3420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:19.398491', 'step': 3420, 'epoch': 3} {'type': 'loss', 'content': 0.26849332451820374, 'timestamp': '2025-09-05 09:15:19.400593', 'step': 3421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:19.565877', 'step': 3421, 'epoch': 3} {'type': 'loss', 'content': 0.2555030584335327, 'timestamp': '2025-09-05 09:15:19.568109', 'step': 3422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:19.770532', 'step': 3422, 'epoch': 3} {'type': 'loss', 'content': 0.3213971257209778, 'timestamp': '2025-09-05 09:15:19.772369', 'step': 3423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:19.968537', 'step': 3423, 'epoch': 3} {'type': 'loss', 'content': 0.2824600338935852, 'timestamp': '2025-09-05 09:15:19.982086', 'step': 3424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:20.169982', 'step': 3424, 'epoch': 3} {'type': 'loss', 'content': 0.2855144739151001, 'timestamp': '2025-09-05 09:15:20.172138', 'step': 3425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:20.366577', 'step': 3425, 'epoch': 3} {'type': 'loss', 'content': 0.3020437955856323, 'timestamp': '2025-09-05 09:15:20.368634', 'step': 3426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:20.564048', 'step': 3426, 'epoch': 3} {'type': 'loss', 'content': 0.26419728994369507, 'timestamp': '2025-09-05 09:15:20.566245', 'step': 3427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:20.762084', 'step': 3427, 'epoch': 3} {'type': 'loss', 'content': 0.3464629054069519, 'timestamp': '2025-09-05 09:15:20.776139', 'step': 3428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:20.971258', 'step': 3428, 'epoch': 3} {'type': 'loss', 'content': 0.2719259560108185, 'timestamp': '2025-09-05 09:15:20.973178', 'step': 3429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:21.177875', 'step': 3429, 'epoch': 3} {'type': 'loss', 'content': 0.29615768790245056, 'timestamp': '2025-09-05 09:15:21.179698', 'step': 3430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:21.374975', 'step': 3430, 'epoch': 3} {'type': 'loss', 'content': 0.1992407739162445, 'timestamp': '2025-09-05 09:15:21.377643', 'step': 3431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:21.573803', 'step': 3431, 'epoch': 3} {'type': 'loss', 'content': 0.23724502325057983, 'timestamp': '2025-09-05 09:15:21.587778', 'step': 3432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:21.774715', 'step': 3432, 'epoch': 3} {'type': 'loss', 'content': 0.3083495795726776, 'timestamp': '2025-09-05 09:15:21.776739', 'step': 3433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:21.980420', 'step': 3433, 'epoch': 3} {'type': 'loss', 'content': 0.3451620638370514, 'timestamp': '2025-09-05 09:15:21.982583', 'step': 3434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:22.179465', 'step': 3434, 'epoch': 3} {'type': 'loss', 'content': 0.3891802728176117, 'timestamp': '2025-09-05 09:15:22.181453', 'step': 3435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:22.385405', 'step': 3435, 'epoch': 3} {'type': 'loss', 'content': 0.3739110231399536, 'timestamp': '2025-09-05 09:15:22.399825', 'step': 3436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:22.595466', 'step': 3436, 'epoch': 3} {'type': 'loss', 'content': 0.2214849293231964, 'timestamp': '2025-09-05 09:15:22.600093', 'step': 3437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:22.767477', 'step': 3437, 'epoch': 3} {'type': 'loss', 'content': 0.36019426584243774, 'timestamp': '2025-09-05 09:15:22.775452', 'step': 3438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:22.982396', 'step': 3438, 'epoch': 3} {'type': 'loss', 'content': 0.1959642767906189, 'timestamp': '2025-09-05 09:15:22.984052', 'step': 3439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:23.180164', 'step': 3439, 'epoch': 3} {'type': 'loss', 'content': 0.30671948194503784, 'timestamp': '2025-09-05 09:15:23.194420', 'step': 3440, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:15:27.842056', 'step': 3440, 'epoch': 3} {'type': 'pplx', 'content': 56.487056186244445, 'timestamp': '2025-09-05 09:15:27.844216', 'step': 3440, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3440', 'timestamp': '2025-09-05 09:15:28.308530', 'step': 3440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:28.469196', 'step': 3440, 'epoch': 3} {'type': 'loss', 'content': 0.4308766722679138, 'timestamp': '2025-09-05 09:15:28.471395', 'step': 3441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:28.672848', 'step': 3441, 'epoch': 3} {'type': 'loss', 'content': 0.2569759488105774, 'timestamp': '2025-09-05 09:15:28.675252', 'step': 3442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:28.871485', 'step': 3442, 'epoch': 3} {'type': 'loss', 'content': 0.39308369159698486, 'timestamp': '2025-09-05 09:15:28.873593', 'step': 3443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:29.067035', 'step': 3443, 'epoch': 3} {'type': 'loss', 'content': 0.36607518792152405, 'timestamp': '2025-09-05 09:15:29.080686', 'step': 3444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:29.266327', 'step': 3444, 'epoch': 3} {'type': 'loss', 'content': 0.3157901167869568, 'timestamp': '2025-09-05 09:15:29.268664', 'step': 3445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:29.463369', 'step': 3445, 'epoch': 3} {'type': 'loss', 'content': 0.32073408365249634, 'timestamp': '2025-09-05 09:15:29.465382', 'step': 3446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:29.668771', 'step': 3446, 'epoch': 3} {'type': 'loss', 'content': 0.21521615982055664, 'timestamp': '2025-09-05 09:15:29.670848', 'step': 3447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:29.873604', 'step': 3447, 'epoch': 3} {'type': 'loss', 'content': 0.24196557700634003, 'timestamp': '2025-09-05 09:15:29.887959', 'step': 3448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:30.082517', 'step': 3448, 'epoch': 3} {'type': 'loss', 'content': 0.34385359287261963, 'timestamp': '2025-09-05 09:15:30.084676', 'step': 3449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:30.279879', 'step': 3449, 'epoch': 3} {'type': 'loss', 'content': 0.2406640201807022, 'timestamp': '2025-09-05 09:15:30.282237', 'step': 3450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:30.475842', 'step': 3450, 'epoch': 3} {'type': 'loss', 'content': 0.18529509007930756, 'timestamp': '2025-09-05 09:15:30.477976', 'step': 3451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:30.672260', 'step': 3451, 'epoch': 3} {'type': 'loss', 'content': 0.3604893088340759, 'timestamp': '2025-09-05 09:15:30.688470', 'step': 3452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:30.883472', 'step': 3452, 'epoch': 3} {'type': 'loss', 'content': 0.3433663249015808, 'timestamp': '2025-09-05 09:15:30.885759', 'step': 3453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:31.079225', 'step': 3453, 'epoch': 3} {'type': 'loss', 'content': 0.2779248058795929, 'timestamp': '2025-09-05 09:15:31.082324', 'step': 3454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:31.275564', 'step': 3454, 'epoch': 3} {'type': 'loss', 'content': 0.31973740458488464, 'timestamp': '2025-09-05 09:15:31.277984', 'step': 3455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:31.471886', 'step': 3455, 'epoch': 3} {'type': 'loss', 'content': 0.349806547164917, 'timestamp': '2025-09-05 09:15:31.486066', 'step': 3456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:31.676963', 'step': 3456, 'epoch': 3} {'type': 'loss', 'content': 0.1699691265821457, 'timestamp': '2025-09-05 09:15:31.678853', 'step': 3457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:31.871846', 'step': 3457, 'epoch': 3} {'type': 'loss', 'content': 0.4177989065647125, 'timestamp': '2025-09-05 09:15:31.883300', 'step': 3458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:32.078448', 'step': 3458, 'epoch': 3} {'type': 'loss', 'content': 0.24496307969093323, 'timestamp': '2025-09-05 09:15:32.080765', 'step': 3459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:32.284260', 'step': 3459, 'epoch': 3} {'type': 'loss', 'content': 0.24864445626735687, 'timestamp': '2025-09-05 09:15:32.298058', 'step': 3460, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:15:36.949048', 'step': 3460, 'epoch': 3} {'type': 'pplx', 'content': 56.84112110557716, 'timestamp': '2025-09-05 09:15:36.951078', 'step': 3460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:37.108299', 'step': 3460, 'epoch': 3} {'type': 'loss', 'content': 0.33684948086738586, 'timestamp': '2025-09-05 09:15:37.111450', 'step': 3461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:37.276196', 'step': 3461, 'epoch': 3} {'type': 'loss', 'content': 0.371894896030426, 'timestamp': '2025-09-05 09:15:37.278182', 'step': 3462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:37.481793', 'step': 3462, 'epoch': 3} {'type': 'loss', 'content': 0.41939064860343933, 'timestamp': '2025-09-05 09:15:37.483876', 'step': 3463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:37.678809', 'step': 3463, 'epoch': 3} {'type': 'loss', 'content': 0.16646873950958252, 'timestamp': '2025-09-05 09:15:37.692365', 'step': 3464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:37.887252', 'step': 3464, 'epoch': 3} {'type': 'loss', 'content': 0.27085840702056885, 'timestamp': '2025-09-05 09:15:37.889497', 'step': 3465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:38.085222', 'step': 3465, 'epoch': 3} {'type': 'loss', 'content': 0.3408336639404297, 'timestamp': '2025-09-05 09:15:38.087272', 'step': 3466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:38.288038', 'step': 3466, 'epoch': 3} {'type': 'loss', 'content': 0.23875372111797333, 'timestamp': '2025-09-05 09:15:38.290372', 'step': 3467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:38.484252', 'step': 3467, 'epoch': 3} {'type': 'loss', 'content': 0.28153443336486816, 'timestamp': '2025-09-05 09:15:38.498227', 'step': 3468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:38.686591', 'step': 3468, 'epoch': 3} {'type': 'loss', 'content': 0.3948226869106293, 'timestamp': '2025-09-05 09:15:38.689339', 'step': 3469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:38.883108', 'step': 3469, 'epoch': 3} {'type': 'loss', 'content': 0.253915011882782, 'timestamp': '2025-09-05 09:15:38.885040', 'step': 3470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:39.079939', 'step': 3470, 'epoch': 3} {'type': 'loss', 'content': 0.2193460613489151, 'timestamp': '2025-09-05 09:15:39.081811', 'step': 3471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:39.276867', 'step': 3471, 'epoch': 3} {'type': 'loss', 'content': 0.19324208796024323, 'timestamp': '2025-09-05 09:15:39.291080', 'step': 3472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:39.478061', 'step': 3472, 'epoch': 3} {'type': 'loss', 'content': 0.3039948344230652, 'timestamp': '2025-09-05 09:15:39.479952', 'step': 3473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:39.675488', 'step': 3473, 'epoch': 3} {'type': 'loss', 'content': 0.36532217264175415, 'timestamp': '2025-09-05 09:15:39.677479', 'step': 3474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:15:39.878087', 'step': 3474, 'epoch': 3} {'type': 'loss', 'content': 0.2959711253643036, 'timestamp': '2025-09-05 09:15:39.879967', 'step': 3475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:40.075382', 'step': 3475, 'epoch': 3} {'type': 'loss', 'content': 0.44074317812919617, 'timestamp': '2025-09-05 09:15:40.091771', 'step': 3476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:40.286274', 'step': 3476, 'epoch': 3} {'type': 'loss', 'content': 0.2943912446498871, 'timestamp': '2025-09-05 09:15:40.288213', 'step': 3477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:40.490989', 'step': 3477, 'epoch': 3} {'type': 'loss', 'content': 0.21967966854572296, 'timestamp': '2025-09-05 09:15:40.492987', 'step': 3478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:40.688577', 'step': 3478, 'epoch': 3} {'type': 'loss', 'content': 0.1738353669643402, 'timestamp': '2025-09-05 09:15:40.691098', 'step': 3479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:40.894679', 'step': 3479, 'epoch': 3} {'type': 'loss', 'content': 0.3198263347148895, 'timestamp': '2025-09-05 09:15:40.908795', 'step': 3480, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:15:45.541745', 'step': 3480, 'epoch': 3} {'type': 'pplx', 'content': 57.259662380369484, 'timestamp': '2025-09-05 09:15:45.543529', 'step': 3480, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3480', 'timestamp': '2025-09-05 09:15:45.993651', 'step': 3480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:46.154598', 'step': 3480, 'epoch': 3} {'type': 'loss', 'content': 0.30058109760284424, 'timestamp': '2025-09-05 09:15:46.156858', 'step': 3481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:46.349969', 'step': 3481, 'epoch': 3} {'type': 'loss', 'content': 0.325216144323349, 'timestamp': '2025-09-05 09:15:46.351846', 'step': 3482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:46.554582', 'step': 3482, 'epoch': 3} {'type': 'loss', 'content': 0.2780161499977112, 'timestamp': '2025-09-05 09:15:46.556875', 'step': 3483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:46.760064', 'step': 3483, 'epoch': 3} {'type': 'loss', 'content': 0.1740078330039978, 'timestamp': '2025-09-05 09:15:46.768946', 'step': 3484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:46.930918', 'step': 3484, 'epoch': 3} {'type': 'loss', 'content': 0.3463882803916931, 'timestamp': '2025-09-05 09:15:46.933011', 'step': 3485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:47.137903', 'step': 3485, 'epoch': 3} {'type': 'loss', 'content': 0.1923578530550003, 'timestamp': '2025-09-05 09:15:47.140077', 'step': 3486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:47.334418', 'step': 3486, 'epoch': 3} {'type': 'loss', 'content': 0.27616503834724426, 'timestamp': '2025-09-05 09:15:47.336586', 'step': 3487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:47.541527', 'step': 3487, 'epoch': 3} {'type': 'loss', 'content': 0.475725382566452, 'timestamp': '2025-09-05 09:15:47.550637', 'step': 3488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:47.714230', 'step': 3488, 'epoch': 3} {'type': 'loss', 'content': 0.42054125666618347, 'timestamp': '2025-09-05 09:15:47.716194', 'step': 3489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:47.920059', 'step': 3489, 'epoch': 3} {'type': 'loss', 'content': 0.2236691415309906, 'timestamp': '2025-09-05 09:15:47.922360', 'step': 3490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:48.117564', 'step': 3490, 'epoch': 3} {'type': 'loss', 'content': 0.21372617781162262, 'timestamp': '2025-09-05 09:15:48.119899', 'step': 3491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:48.282364', 'step': 3491, 'epoch': 3} {'type': 'loss', 'content': 0.3831466734409332, 'timestamp': '2025-09-05 09:15:48.296563', 'step': 3492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:48.480614', 'step': 3492, 'epoch': 3} {'type': 'loss', 'content': 0.4005196690559387, 'timestamp': '2025-09-05 09:15:48.482752', 'step': 3493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:48.687366', 'step': 3493, 'epoch': 3} {'type': 'loss', 'content': 0.3020406663417816, 'timestamp': '2025-09-05 09:15:48.689928', 'step': 3494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:48.882552', 'step': 3494, 'epoch': 3} {'type': 'loss', 'content': 0.33710694313049316, 'timestamp': '2025-09-05 09:15:48.884841', 'step': 3495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:49.077262', 'step': 3495, 'epoch': 3} {'type': 'loss', 'content': 0.41189056634902954, 'timestamp': '2025-09-05 09:15:49.093452', 'step': 3496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:49.285982', 'step': 3496, 'epoch': 3} {'type': 'loss', 'content': 0.3529796004295349, 'timestamp': '2025-09-05 09:15:49.288044', 'step': 3497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:49.481074', 'step': 3497, 'epoch': 3} {'type': 'loss', 'content': 0.41452711820602417, 'timestamp': '2025-09-05 09:15:49.483836', 'step': 3498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:49.685250', 'step': 3498, 'epoch': 3} {'type': 'loss', 'content': 0.23813626170158386, 'timestamp': '2025-09-05 09:15:49.688159', 'step': 3499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:49.890585', 'step': 3499, 'epoch': 3} {'type': 'loss', 'content': 0.2140645980834961, 'timestamp': '2025-09-05 09:15:49.904427', 'step': 3500, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:15:54.532826', 'step': 3500, 'epoch': 3} {'type': 'pplx', 'content': 57.432321365212985, 'timestamp': '2025-09-05 09:15:54.535264', 'step': 3500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:54.697969', 'step': 3500, 'epoch': 3} {'type': 'loss', 'content': 0.3301979899406433, 'timestamp': '2025-09-05 09:15:54.700718', 'step': 3501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:54.868212', 'step': 3501, 'epoch': 3} {'type': 'loss', 'content': 0.25775179266929626, 'timestamp': '2025-09-05 09:15:54.870710', 'step': 3502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:55.074541', 'step': 3502, 'epoch': 3} {'type': 'loss', 'content': 0.2093990296125412, 'timestamp': '2025-09-05 09:15:55.076740', 'step': 3503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:55.270867', 'step': 3503, 'epoch': 3} {'type': 'loss', 'content': 0.23625101149082184, 'timestamp': '2025-09-05 09:15:55.286009', 'step': 3504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:55.473909', 'step': 3504, 'epoch': 3} {'type': 'loss', 'content': 0.3592059016227722, 'timestamp': '2025-09-05 09:15:55.475922', 'step': 3505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:55.679420', 'step': 3505, 'epoch': 3} {'type': 'loss', 'content': 0.2650335431098938, 'timestamp': '2025-09-05 09:15:55.681455', 'step': 3506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:55.885416', 'step': 3506, 'epoch': 3} {'type': 'loss', 'content': 0.25605595111846924, 'timestamp': '2025-09-05 09:15:55.887447', 'step': 3507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:56.082118', 'step': 3507, 'epoch': 3} {'type': 'loss', 'content': 0.2315322905778885, 'timestamp': '2025-09-05 09:15:56.098407', 'step': 3508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:15:56.294219', 'step': 3508, 'epoch': 3} {'type': 'loss', 'content': 0.3174898028373718, 'timestamp': '2025-09-05 09:15:56.296213', 'step': 3509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:56.499496', 'step': 3509, 'epoch': 3} {'type': 'loss', 'content': 0.3008250296115875, 'timestamp': '2025-09-05 09:15:56.501873', 'step': 3510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:56.704909', 'step': 3510, 'epoch': 3} {'type': 'loss', 'content': 0.33064156770706177, 'timestamp': '2025-09-05 09:15:56.707016', 'step': 3511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:15:56.902443', 'step': 3511, 'epoch': 3} {'type': 'loss', 'content': 0.2688453793525696, 'timestamp': '2025-09-05 09:15:56.911436', 'step': 3512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:57.074020', 'step': 3512, 'epoch': 3} {'type': 'loss', 'content': 0.23117870092391968, 'timestamp': '2025-09-05 09:15:57.075969', 'step': 3513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:15:57.280000', 'step': 3513, 'epoch': 3} {'type': 'loss', 'content': 0.17968448996543884, 'timestamp': '2025-09-05 09:15:57.281980', 'step': 3514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:57.479408', 'step': 3514, 'epoch': 3} {'type': 'loss', 'content': 0.25697532296180725, 'timestamp': '2025-09-05 09:15:57.481853', 'step': 3515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:57.685913', 'step': 3515, 'epoch': 3} {'type': 'loss', 'content': 0.29816049337387085, 'timestamp': '2025-09-05 09:15:57.698968', 'step': 3516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:57.885343', 'step': 3516, 'epoch': 3} {'type': 'loss', 'content': 0.23597969114780426, 'timestamp': '2025-09-05 09:15:57.887857', 'step': 3517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:15:58.093491', 'step': 3517, 'epoch': 3} {'type': 'loss', 'content': 0.26065388321876526, 'timestamp': '2025-09-05 09:15:58.095763', 'step': 3518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:15:58.291219', 'step': 3518, 'epoch': 3} {'type': 'loss', 'content': 0.24879339337348938, 'timestamp': '2025-09-05 09:15:58.293254', 'step': 3519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:15:58.489325', 'step': 3519, 'epoch': 3} {'type': 'loss', 'content': 0.2509574294090271, 'timestamp': '2025-09-05 09:15:58.502474', 'step': 3520, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:16:03.161672', 'step': 3520, 'epoch': 3} {'type': 'pplx', 'content': 57.248311386178514, 'timestamp': '2025-09-05 09:16:03.164871', 'step': 3520, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3520', 'timestamp': '2025-09-05 09:16:03.773257', 'step': 3520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:03.938424', 'step': 3520, 'epoch': 3} {'type': 'loss', 'content': 0.24730339646339417, 'timestamp': '2025-09-05 09:16:03.940606', 'step': 3521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:04.135104', 'step': 3521, 'epoch': 3} {'type': 'loss', 'content': 0.3603556752204895, 'timestamp': '2025-09-05 09:16:04.137174', 'step': 3522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:04.332922', 'step': 3522, 'epoch': 3} {'type': 'loss', 'content': 0.27035149931907654, 'timestamp': '2025-09-05 09:16:04.335173', 'step': 3523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:04.531971', 'step': 3523, 'epoch': 3} {'type': 'loss', 'content': 0.19072581827640533, 'timestamp': '2025-09-05 09:16:04.546028', 'step': 3524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:04.732763', 'step': 3524, 'epoch': 3} {'type': 'loss', 'content': 0.4346585273742676, 'timestamp': '2025-09-05 09:16:04.735731', 'step': 3525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:04.929976', 'step': 3525, 'epoch': 3} {'type': 'loss', 'content': 0.30274319648742676, 'timestamp': '2025-09-05 09:16:04.932300', 'step': 3526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:05.128812', 'step': 3526, 'epoch': 3} {'type': 'loss', 'content': 0.2595900893211365, 'timestamp': '2025-09-05 09:16:05.130772', 'step': 3527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:05.324734', 'step': 3527, 'epoch': 3} {'type': 'loss', 'content': 0.20915935933589935, 'timestamp': '2025-09-05 09:16:05.340930', 'step': 3528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:05.536897', 'step': 3528, 'epoch': 3} {'type': 'loss', 'content': 0.26858463883399963, 'timestamp': '2025-09-05 09:16:05.539280', 'step': 3529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:05.705787', 'step': 3529, 'epoch': 3} {'type': 'loss', 'content': 0.35047516226768494, 'timestamp': '2025-09-05 09:16:05.708011', 'step': 3530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:05.911338', 'step': 3530, 'epoch': 3} {'type': 'loss', 'content': 0.2243708372116089, 'timestamp': '2025-09-05 09:16:05.913729', 'step': 3531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:06.082348', 'step': 3531, 'epoch': 3} {'type': 'loss', 'content': 0.19937127828598022, 'timestamp': '2025-09-05 09:16:06.098218', 'step': 3532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:06.294245', 'step': 3532, 'epoch': 3} {'type': 'loss', 'content': 0.17561663687229156, 'timestamp': '2025-09-05 09:16:06.296290', 'step': 3533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:06.497951', 'step': 3533, 'epoch': 3} {'type': 'loss', 'content': 0.1659424901008606, 'timestamp': '2025-09-05 09:16:06.499835', 'step': 3534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:06.696327', 'step': 3534, 'epoch': 3} {'type': 'loss', 'content': 0.28922516107559204, 'timestamp': '2025-09-05 09:16:06.699244', 'step': 3535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:06.896587', 'step': 3535, 'epoch': 3} {'type': 'loss', 'content': 0.13516835868358612, 'timestamp': '2025-09-05 09:16:06.911478', 'step': 3536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:07.110059', 'step': 3536, 'epoch': 3} {'type': 'loss', 'content': 0.217886820435524, 'timestamp': '2025-09-05 09:16:07.112524', 'step': 3537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:07.308950', 'step': 3537, 'epoch': 3} {'type': 'loss', 'content': 0.2815990149974823, 'timestamp': '2025-09-05 09:16:07.310854', 'step': 3538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:07.477125', 'step': 3538, 'epoch': 3} {'type': 'loss', 'content': 0.34248197078704834, 'timestamp': '2025-09-05 09:16:07.479196', 'step': 3539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:07.683880', 'step': 3539, 'epoch': 3} {'type': 'loss', 'content': 0.35142749547958374, 'timestamp': '2025-09-05 09:16:07.700213', 'step': 3540, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:16:12.334278', 'step': 3540, 'epoch': 3} {'type': 'pplx', 'content': 57.18878524249908, 'timestamp': '2025-09-05 09:16:12.336402', 'step': 3540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:12.495423', 'step': 3540, 'epoch': 3} {'type': 'loss', 'content': 0.2691498398780823, 'timestamp': '2025-09-05 09:16:12.497432', 'step': 3541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:12.703100', 'step': 3541, 'epoch': 3} {'type': 'loss', 'content': 0.21435484290122986, 'timestamp': '2025-09-05 09:16:12.705330', 'step': 3542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:12.901321', 'step': 3542, 'epoch': 3} {'type': 'loss', 'content': 0.257010281085968, 'timestamp': '2025-09-05 09:16:12.903276', 'step': 3543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:13.107816', 'step': 3543, 'epoch': 3} {'type': 'loss', 'content': 0.31392380595207214, 'timestamp': '2025-09-05 09:16:13.121641', 'step': 3544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:13.316878', 'step': 3544, 'epoch': 3} {'type': 'loss', 'content': 0.2716064751148224, 'timestamp': '2025-09-05 09:16:13.318958', 'step': 3545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:13.483554', 'step': 3545, 'epoch': 3} {'type': 'loss', 'content': 0.33304086327552795, 'timestamp': '2025-09-05 09:16:13.486119', 'step': 3546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:16:13.688428', 'step': 3546, 'epoch': 3} {'type': 'loss', 'content': 0.3453178405761719, 'timestamp': '2025-09-05 09:16:13.690938', 'step': 3547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:13.890286', 'step': 3547, 'epoch': 3} {'type': 'loss', 'content': 0.3486884832382202, 'timestamp': '2025-09-05 09:16:13.907139', 'step': 3548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:16:14.103328', 'step': 3548, 'epoch': 3} {'type': 'loss', 'content': 0.44889482855796814, 'timestamp': '2025-09-05 09:16:14.106268', 'step': 3549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:14.302303', 'step': 3549, 'epoch': 3} {'type': 'loss', 'content': 0.2349972277879715, 'timestamp': '2025-09-05 09:16:14.304333', 'step': 3550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:14.503881', 'step': 3550, 'epoch': 3} {'type': 'loss', 'content': 0.18555277585983276, 'timestamp': '2025-09-05 09:16:14.505559', 'step': 3551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:14.701071', 'step': 3551, 'epoch': 3} {'type': 'loss', 'content': 0.34614723920822144, 'timestamp': '2025-09-05 09:16:14.714520', 'step': 3552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:14.902023', 'step': 3552, 'epoch': 3} {'type': 'loss', 'content': 0.22294148802757263, 'timestamp': '2025-09-05 09:16:14.903759', 'step': 3553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:15.099938', 'step': 3553, 'epoch': 3} {'type': 'loss', 'content': 0.3558078706264496, 'timestamp': '2025-09-05 09:16:15.102509', 'step': 3554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:15.295938', 'step': 3554, 'epoch': 3} {'type': 'loss', 'content': 0.20311148464679718, 'timestamp': '2025-09-05 09:16:15.297998', 'step': 3555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:15.492813', 'step': 3555, 'epoch': 3} {'type': 'loss', 'content': 0.2172691971063614, 'timestamp': '2025-09-05 09:16:15.508345', 'step': 3556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:15.702178', 'step': 3556, 'epoch': 3} {'type': 'loss', 'content': 0.29311197996139526, 'timestamp': '2025-09-05 09:16:15.704350', 'step': 3557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:15.907404', 'step': 3557, 'epoch': 3} {'type': 'loss', 'content': 0.31963905692100525, 'timestamp': '2025-09-05 09:16:15.909545', 'step': 3558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:16.111802', 'step': 3558, 'epoch': 3} {'type': 'loss', 'content': 0.47033655643463135, 'timestamp': '2025-09-05 09:16:16.113736', 'step': 3559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:16:16.307815', 'step': 3559, 'epoch': 3} {'type': 'loss', 'content': 0.34976720809936523, 'timestamp': '2025-09-05 09:16:16.321703', 'step': 3560, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:16:21.188059', 'step': 3560, 'epoch': 3} {'type': 'pplx', 'content': 57.505925013612185, 'timestamp': '2025-09-05 09:16:21.190112', 'step': 3560, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3560', 'timestamp': '2025-09-05 09:16:21.625686', 'step': 3560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:21.786287', 'step': 3560, 'epoch': 3} {'type': 'loss', 'content': 0.33070307970046997, 'timestamp': '2025-09-05 09:16:21.788767', 'step': 3561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:21.953471', 'step': 3561, 'epoch': 3} {'type': 'loss', 'content': 0.2531128227710724, 'timestamp': '2025-09-05 09:16:21.955557', 'step': 3562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:22.148695', 'step': 3562, 'epoch': 3} {'type': 'loss', 'content': 0.3364905118942261, 'timestamp': '2025-09-05 09:16:22.150816', 'step': 3563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:22.321957', 'step': 3563, 'epoch': 3} {'type': 'loss', 'content': 0.2518615424633026, 'timestamp': '2025-09-05 09:16:22.335784', 'step': 3564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:22.489115', 'step': 3564, 'epoch': 3} {'type': 'loss', 'content': 0.35287201404571533, 'timestamp': '2025-09-05 09:16:22.491003', 'step': 3565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:22.649868', 'step': 3565, 'epoch': 3} {'type': 'loss', 'content': 0.31937721371650696, 'timestamp': '2025-09-05 09:16:22.651695', 'step': 3566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:22.821112', 'step': 3566, 'epoch': 3} {'type': 'loss', 'content': 0.329274445772171, 'timestamp': '2025-09-05 09:16:22.823418', 'step': 3567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:22.981071', 'step': 3567, 'epoch': 3} {'type': 'loss', 'content': 0.23146797716617584, 'timestamp': '2025-09-05 09:16:22.994974', 'step': 3568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:16:23.147220', 'step': 3568, 'epoch': 3} {'type': 'loss', 'content': 0.3371192514896393, 'timestamp': '2025-09-05 09:16:23.149264', 'step': 3569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:23.309883', 'step': 3569, 'epoch': 3} {'type': 'loss', 'content': 0.30641764402389526, 'timestamp': '2025-09-05 09:16:23.312208', 'step': 3570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:23.470545', 'step': 3570, 'epoch': 3} {'type': 'loss', 'content': 0.3931610882282257, 'timestamp': '2025-09-05 09:16:23.472232', 'step': 3571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:23.628899', 'step': 3571, 'epoch': 3} {'type': 'loss', 'content': 0.2743593454360962, 'timestamp': '2025-09-05 09:16:23.642205', 'step': 3572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:23.793306', 'step': 3572, 'epoch': 3} {'type': 'loss', 'content': 0.19696570932865143, 'timestamp': '2025-09-05 09:16:23.795320', 'step': 3573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:23.964282', 'step': 3573, 'epoch': 3} {'type': 'loss', 'content': 0.31469157338142395, 'timestamp': '2025-09-05 09:16:23.966092', 'step': 3574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:24.124842', 'step': 3574, 'epoch': 3} {'type': 'loss', 'content': 0.35265278816223145, 'timestamp': '2025-09-05 09:16:24.126761', 'step': 3575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:24.285175', 'step': 3575, 'epoch': 3} {'type': 'loss', 'content': 0.2801409959793091, 'timestamp': '2025-09-05 09:16:24.299057', 'step': 3576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:24.450733', 'step': 3576, 'epoch': 3} {'type': 'loss', 'content': 0.26633474230766296, 'timestamp': '2025-09-05 09:16:24.453513', 'step': 3577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:24.610965', 'step': 3577, 'epoch': 3} {'type': 'loss', 'content': 0.3046843707561493, 'timestamp': '2025-09-05 09:16:24.612825', 'step': 3578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:24.769922', 'step': 3578, 'epoch': 3} {'type': 'loss', 'content': 0.3721226453781128, 'timestamp': '2025-09-05 09:16:24.771766', 'step': 3579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:24.931204', 'step': 3579, 'epoch': 3} {'type': 'loss', 'content': 0.264265239238739, 'timestamp': '2025-09-05 09:16:24.945012', 'step': 3580, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:16:29.567482', 'step': 3580, 'epoch': 3} {'type': 'pplx', 'content': 57.221559294521626, 'timestamp': '2025-09-05 09:16:29.569634', 'step': 3580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:29.702774', 'step': 3580, 'epoch': 3} {'type': 'loss', 'content': 0.27857106924057007, 'timestamp': '2025-09-05 09:16:29.705099', 'step': 3581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:29.841394', 'step': 3581, 'epoch': 3} {'type': 'loss', 'content': 0.35566446185112, 'timestamp': '2025-09-05 09:16:29.843601', 'step': 3582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:30.016043', 'step': 3582, 'epoch': 3} {'type': 'loss', 'content': 0.2681717574596405, 'timestamp': '2025-09-05 09:16:30.018309', 'step': 3583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:30.183556', 'step': 3583, 'epoch': 3} {'type': 'loss', 'content': 0.29908114671707153, 'timestamp': '2025-09-05 09:16:30.199639', 'step': 3584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:30.363380', 'step': 3584, 'epoch': 3} {'type': 'loss', 'content': 0.32012149691581726, 'timestamp': '2025-09-05 09:16:30.365411', 'step': 3585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:30.534700', 'step': 3585, 'epoch': 3} {'type': 'loss', 'content': 0.1617680937051773, 'timestamp': '2025-09-05 09:16:30.536749', 'step': 3586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:30.711043', 'step': 3586, 'epoch': 3} {'type': 'loss', 'content': 0.18636199831962585, 'timestamp': '2025-09-05 09:16:30.713254', 'step': 3587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:30.877559', 'step': 3587, 'epoch': 3} {'type': 'loss', 'content': 0.26028791069984436, 'timestamp': '2025-09-05 09:16:30.891716', 'step': 3588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:31.048008', 'step': 3588, 'epoch': 3} {'type': 'loss', 'content': 0.23478567600250244, 'timestamp': '2025-09-05 09:16:31.050538', 'step': 3589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:31.214481', 'step': 3589, 'epoch': 3} {'type': 'loss', 'content': 0.25303134322166443, 'timestamp': '2025-09-05 09:16:31.216538', 'step': 3590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:31.353478', 'step': 3590, 'epoch': 3} {'type': 'loss', 'content': 0.3114836812019348, 'timestamp': '2025-09-05 09:16:31.355987', 'step': 3591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:31.491787', 'step': 3591, 'epoch': 3} {'type': 'loss', 'content': 0.341848224401474, 'timestamp': '2025-09-05 09:16:31.507779', 'step': 3592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:31.672999', 'step': 3592, 'epoch': 3} {'type': 'loss', 'content': 0.34669381380081177, 'timestamp': '2025-09-05 09:16:31.675020', 'step': 3593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:31.839374', 'step': 3593, 'epoch': 3} {'type': 'loss', 'content': 0.24734577536582947, 'timestamp': '2025-09-05 09:16:31.841260', 'step': 3594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:31.979773', 'step': 3594, 'epoch': 3} {'type': 'loss', 'content': 0.2852213978767395, 'timestamp': '2025-09-05 09:16:31.982039', 'step': 3595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:32.154180', 'step': 3595, 'epoch': 3} {'type': 'loss', 'content': 0.20489144325256348, 'timestamp': '2025-09-05 09:16:32.168329', 'step': 3596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:32.328468', 'step': 3596, 'epoch': 3} {'type': 'loss', 'content': 0.2507217824459076, 'timestamp': '2025-09-05 09:16:32.330751', 'step': 3597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:16:32.494693', 'step': 3597, 'epoch': 3} {'type': 'loss', 'content': 0.29786399006843567, 'timestamp': '2025-09-05 09:16:32.496588', 'step': 3598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:32.669476', 'step': 3598, 'epoch': 3} {'type': 'loss', 'content': 0.31531691551208496, 'timestamp': '2025-09-05 09:16:32.671583', 'step': 3599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:32.845699', 'step': 3599, 'epoch': 3} {'type': 'loss', 'content': 0.28864872455596924, 'timestamp': '2025-09-05 09:16:32.862166', 'step': 3600, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:16:37.515636', 'step': 3600, 'epoch': 3} {'type': 'pplx', 'content': 56.849603424621414, 'timestamp': '2025-09-05 09:16:37.517453', 'step': 3600, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3600', 'timestamp': '2025-09-05 09:16:37.973586', 'step': 3600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:38.113656', 'step': 3600, 'epoch': 3} {'type': 'loss', 'content': 0.3407289683818817, 'timestamp': '2025-09-05 09:16:38.115499', 'step': 3601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:38.313738', 'step': 3601, 'epoch': 3} {'type': 'loss', 'content': 0.2807539999485016, 'timestamp': '2025-09-05 09:16:38.315728', 'step': 3602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:38.481679', 'step': 3602, 'epoch': 3} {'type': 'loss', 'content': 0.2940472662448883, 'timestamp': '2025-09-05 09:16:38.483627', 'step': 3603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:38.650368', 'step': 3603, 'epoch': 3} {'type': 'loss', 'content': 0.3419131934642792, 'timestamp': '2025-09-05 09:16:38.663614', 'step': 3604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:38.821500', 'step': 3604, 'epoch': 3} {'type': 'loss', 'content': 0.2667962610721588, 'timestamp': '2025-09-05 09:16:38.824940', 'step': 3605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:38.989583', 'step': 3605, 'epoch': 3} {'type': 'loss', 'content': 0.3813531696796417, 'timestamp': '2025-09-05 09:16:38.991762', 'step': 3606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:39.156661', 'step': 3606, 'epoch': 3} {'type': 'loss', 'content': 0.21620894968509674, 'timestamp': '2025-09-05 09:16:39.158848', 'step': 3607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:39.322673', 'step': 3607, 'epoch': 3} {'type': 'loss', 'content': 0.28690725564956665, 'timestamp': '2025-09-05 09:16:39.337856', 'step': 3608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:39.504144', 'step': 3608, 'epoch': 3} {'type': 'loss', 'content': 0.2018442302942276, 'timestamp': '2025-09-05 09:16:39.506367', 'step': 3609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:39.694573', 'step': 3609, 'epoch': 3} {'type': 'loss', 'content': 0.24246446788311005, 'timestamp': '2025-09-05 09:16:39.696675', 'step': 3610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:39.903678', 'step': 3610, 'epoch': 3} {'type': 'loss', 'content': 0.2503988444805145, 'timestamp': '2025-09-05 09:16:39.905565', 'step': 3611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:40.101897', 'step': 3611, 'epoch': 3} {'type': 'loss', 'content': 0.2636902928352356, 'timestamp': '2025-09-05 09:16:40.114992', 'step': 3612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:40.309381', 'step': 3612, 'epoch': 3} {'type': 'loss', 'content': 0.2856784164905548, 'timestamp': '2025-09-05 09:16:40.311717', 'step': 3613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:40.507200', 'step': 3613, 'epoch': 3} {'type': 'loss', 'content': 0.21770760416984558, 'timestamp': '2025-09-05 09:16:40.509303', 'step': 3614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:40.712550', 'step': 3614, 'epoch': 3} {'type': 'loss', 'content': 0.3094342350959778, 'timestamp': '2025-09-05 09:16:40.714794', 'step': 3615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:40.921171', 'step': 3615, 'epoch': 3} {'type': 'loss', 'content': 0.22260645031929016, 'timestamp': '2025-09-05 09:16:40.937490', 'step': 3616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:41.132556', 'step': 3616, 'epoch': 3} {'type': 'loss', 'content': 0.2806592881679535, 'timestamp': '2025-09-05 09:16:41.134659', 'step': 3617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:41.330202', 'step': 3617, 'epoch': 3} {'type': 'loss', 'content': 0.2558315098285675, 'timestamp': '2025-09-05 09:16:41.332341', 'step': 3618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:41.535714', 'step': 3618, 'epoch': 3} {'type': 'loss', 'content': 0.29388701915740967, 'timestamp': '2025-09-05 09:16:41.538011', 'step': 3619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:41.735923', 'step': 3619, 'epoch': 3} {'type': 'loss', 'content': 0.233632892370224, 'timestamp': '2025-09-05 09:16:41.750243', 'step': 3620, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:16:46.373597', 'step': 3620, 'epoch': 3} {'type': 'pplx', 'content': 56.65468023616325, 'timestamp': '2025-09-05 09:16:46.375753', 'step': 3620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:46.536738', 'step': 3620, 'epoch': 3} {'type': 'loss', 'content': 0.3380981683731079, 'timestamp': '2025-09-05 09:16:46.539311', 'step': 3621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:46.703329', 'step': 3621, 'epoch': 3} {'type': 'loss', 'content': 0.44205331802368164, 'timestamp': '2025-09-05 09:16:46.705667', 'step': 3622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:46.911147', 'step': 3622, 'epoch': 3} {'type': 'loss', 'content': 0.22291947901248932, 'timestamp': '2025-09-05 09:16:46.913490', 'step': 3623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:47.117238', 'step': 3623, 'epoch': 3} {'type': 'loss', 'content': 0.2371853142976761, 'timestamp': '2025-09-05 09:16:47.134683', 'step': 3624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:47.325947', 'step': 3624, 'epoch': 3} {'type': 'loss', 'content': 0.3914877772331238, 'timestamp': '2025-09-05 09:16:47.327818', 'step': 3625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:47.522207', 'step': 3625, 'epoch': 3} {'type': 'loss', 'content': 0.18587626516819, 'timestamp': '2025-09-05 09:16:47.524016', 'step': 3626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:47.727008', 'step': 3626, 'epoch': 3} {'type': 'loss', 'content': 0.33849427103996277, 'timestamp': '2025-09-05 09:16:47.728996', 'step': 3627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:47.932545', 'step': 3627, 'epoch': 3} {'type': 'loss', 'content': 0.30909058451652527, 'timestamp': '2025-09-05 09:16:47.948909', 'step': 3628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:48.145519', 'step': 3628, 'epoch': 3} {'type': 'loss', 'content': 0.3659791350364685, 'timestamp': '2025-09-05 09:16:48.147519', 'step': 3629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:48.352926', 'step': 3629, 'epoch': 3} {'type': 'loss', 'content': 0.33907654881477356, 'timestamp': '2025-09-05 09:16:48.354868', 'step': 3630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:48.549234', 'step': 3630, 'epoch': 3} {'type': 'loss', 'content': 0.20109602808952332, 'timestamp': '2025-09-05 09:16:48.551245', 'step': 3631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:48.756040', 'step': 3631, 'epoch': 3} {'type': 'loss', 'content': 0.20491234958171844, 'timestamp': '2025-09-05 09:16:48.769919', 'step': 3632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:48.959793', 'step': 3632, 'epoch': 3} {'type': 'loss', 'content': 0.22076480090618134, 'timestamp': '2025-09-05 09:16:48.961632', 'step': 3633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:49.156216', 'step': 3633, 'epoch': 3} {'type': 'loss', 'content': 0.3952508568763733, 'timestamp': '2025-09-05 09:16:49.158140', 'step': 3634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:16:49.353843', 'step': 3634, 'epoch': 3} {'type': 'loss', 'content': 0.365893691778183, 'timestamp': '2025-09-05 09:16:49.356162', 'step': 3635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:49.552509', 'step': 3635, 'epoch': 3} {'type': 'loss', 'content': 0.4604712426662445, 'timestamp': '2025-09-05 09:16:49.566332', 'step': 3636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:49.753640', 'step': 3636, 'epoch': 3} {'type': 'loss', 'content': 0.2598891258239746, 'timestamp': '2025-09-05 09:16:49.755677', 'step': 3637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:49.952092', 'step': 3637, 'epoch': 3} {'type': 'loss', 'content': 0.19562163949012756, 'timestamp': '2025-09-05 09:16:49.954362', 'step': 3638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:16:50.150810', 'step': 3638, 'epoch': 3} {'type': 'loss', 'content': 0.3006066679954529, 'timestamp': '2025-09-05 09:16:50.152516', 'step': 3639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:50.357244', 'step': 3639, 'epoch': 3} {'type': 'loss', 'content': 0.26009029150009155, 'timestamp': '2025-09-05 09:16:50.370461', 'step': 3640, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:16:55.002230', 'step': 3640, 'epoch': 3} {'type': 'pplx', 'content': 57.14356651280716, 'timestamp': '2025-09-05 09:16:55.003937', 'step': 3640, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3640', 'timestamp': '2025-09-05 09:16:55.470108', 'step': 3640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:55.637860', 'step': 3640, 'epoch': 3} {'type': 'loss', 'content': 0.33125096559524536, 'timestamp': '2025-09-05 09:16:55.640180', 'step': 3641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:55.834264', 'step': 3641, 'epoch': 3} {'type': 'loss', 'content': 0.29791468381881714, 'timestamp': '2025-09-05 09:16:55.836217', 'step': 3642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:56.037864', 'step': 3642, 'epoch': 3} {'type': 'loss', 'content': 0.22799059748649597, 'timestamp': '2025-09-05 09:16:56.040517', 'step': 3643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:56.237007', 'step': 3643, 'epoch': 3} {'type': 'loss', 'content': 0.19178558886051178, 'timestamp': '2025-09-05 09:16:56.251290', 'step': 3644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:56.437268', 'step': 3644, 'epoch': 3} {'type': 'loss', 'content': 0.2333633154630661, 'timestamp': '2025-09-05 09:16:56.439316', 'step': 3645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:16:56.641884', 'step': 3645, 'epoch': 3} {'type': 'loss', 'content': 0.3371458053588867, 'timestamp': '2025-09-05 09:16:56.643663', 'step': 3646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:16:56.838438', 'step': 3646, 'epoch': 3} {'type': 'loss', 'content': 0.2189522385597229, 'timestamp': '2025-09-05 09:16:56.840576', 'step': 3647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:57.036412', 'step': 3647, 'epoch': 3} {'type': 'loss', 'content': 0.20619438588619232, 'timestamp': '2025-09-05 09:16:57.052386', 'step': 3648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:57.247348', 'step': 3648, 'epoch': 3} {'type': 'loss', 'content': 0.3777850568294525, 'timestamp': '2025-09-05 09:16:57.249655', 'step': 3649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:57.453023', 'step': 3649, 'epoch': 3} {'type': 'loss', 'content': 0.19339697062969208, 'timestamp': '2025-09-05 09:16:57.455107', 'step': 3650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:57.650129', 'step': 3650, 'epoch': 3} {'type': 'loss', 'content': 0.24509845674037933, 'timestamp': '2025-09-05 09:16:57.653034', 'step': 3651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:57.851212', 'step': 3651, 'epoch': 3} {'type': 'loss', 'content': 0.20124991238117218, 'timestamp': '2025-09-05 09:16:57.865147', 'step': 3652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:58.053115', 'step': 3652, 'epoch': 3} {'type': 'loss', 'content': 0.2905905246734619, 'timestamp': '2025-09-05 09:16:58.055367', 'step': 3653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:58.259967', 'step': 3653, 'epoch': 3} {'type': 'loss', 'content': 0.3718425929546356, 'timestamp': '2025-09-05 09:16:58.262379', 'step': 3654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:58.469732', 'step': 3654, 'epoch': 3} {'type': 'loss', 'content': 0.278747022151947, 'timestamp': '2025-09-05 09:16:58.472249', 'step': 3655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:58.637485', 'step': 3655, 'epoch': 3} {'type': 'loss', 'content': 0.15937921404838562, 'timestamp': '2025-09-05 09:16:58.654440', 'step': 3656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:58.853334', 'step': 3656, 'epoch': 3} {'type': 'loss', 'content': 0.25874295830726624, 'timestamp': '2025-09-05 09:16:58.855297', 'step': 3657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:16:59.060556', 'step': 3657, 'epoch': 3} {'type': 'loss', 'content': 0.29497626423835754, 'timestamp': '2025-09-05 09:16:59.062998', 'step': 3658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:16:59.260298', 'step': 3658, 'epoch': 3} {'type': 'loss', 'content': 0.31460994482040405, 'timestamp': '2025-09-05 09:16:59.262889', 'step': 3659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:16:59.468470', 'step': 3659, 'epoch': 3} {'type': 'loss', 'content': 0.23476682603359222, 'timestamp': '2025-09-05 09:16:59.482345', 'step': 3660, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:17:04.179168', 'step': 3660, 'epoch': 3} {'type': 'pplx', 'content': 58.84672962747659, 'timestamp': '2025-09-05 09:17:04.183501', 'step': 3660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:17:04.345280', 'step': 3660, 'epoch': 3} {'type': 'loss', 'content': 0.2967011034488678, 'timestamp': '2025-09-05 09:17:04.347457', 'step': 3661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:04.486037', 'step': 3661, 'epoch': 3} {'type': 'loss', 'content': 0.20157547295093536, 'timestamp': '2025-09-05 09:17:04.491324', 'step': 3662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:04.669365', 'step': 3662, 'epoch': 3} {'type': 'loss', 'content': 0.29696306586265564, 'timestamp': '2025-09-05 09:17:04.671800', 'step': 3663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:04.836191', 'step': 3663, 'epoch': 3} {'type': 'loss', 'content': 0.31658288836479187, 'timestamp': '2025-09-05 09:17:04.845138', 'step': 3664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:04.981361', 'step': 3664, 'epoch': 3} {'type': 'loss', 'content': 0.31298524141311646, 'timestamp': '2025-09-05 09:17:04.986225', 'step': 3665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:05.204135', 'step': 3665, 'epoch': 3} {'type': 'loss', 'content': 0.24473224580287933, 'timestamp': '2025-09-05 09:17:05.207421', 'step': 3666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:05.386416', 'step': 3666, 'epoch': 3} {'type': 'loss', 'content': 0.30507656931877136, 'timestamp': '2025-09-05 09:17:05.388406', 'step': 3667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:05.562709', 'step': 3667, 'epoch': 3} {'type': 'loss', 'content': 0.3829200565814972, 'timestamp': '2025-09-05 09:17:05.572358', 'step': 3668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:17:05.707364', 'step': 3668, 'epoch': 3} {'type': 'loss', 'content': 0.21303744614124298, 'timestamp': '2025-09-05 09:17:05.709248', 'step': 3669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:05.873112', 'step': 3669, 'epoch': 3} {'type': 'loss', 'content': 0.19828033447265625, 'timestamp': '2025-09-05 09:17:05.875438', 'step': 3670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:06.044585', 'step': 3670, 'epoch': 3} {'type': 'loss', 'content': 0.19999150931835175, 'timestamp': '2025-09-05 09:17:06.046676', 'step': 3671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:17:06.210600', 'step': 3671, 'epoch': 3} {'type': 'loss', 'content': 0.20807354152202606, 'timestamp': '2025-09-05 09:17:06.224093', 'step': 3672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:06.381194', 'step': 3672, 'epoch': 3} {'type': 'loss', 'content': 0.337863951921463, 'timestamp': '2025-09-05 09:17:06.383013', 'step': 3673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:06.548655', 'step': 3673, 'epoch': 3} {'type': 'loss', 'content': 0.43848153948783875, 'timestamp': '2025-09-05 09:17:06.550812', 'step': 3674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:06.726192', 'step': 3674, 'epoch': 3} {'type': 'loss', 'content': 0.3983319401741028, 'timestamp': '2025-09-05 09:17:06.728124', 'step': 3675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:06.902251', 'step': 3675, 'epoch': 3} {'type': 'loss', 'content': 0.18885241448879242, 'timestamp': '2025-09-05 09:17:06.917409', 'step': 3676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:07.095542', 'step': 3676, 'epoch': 3} {'type': 'loss', 'content': 0.22875548899173737, 'timestamp': '2025-09-05 09:17:07.097896', 'step': 3677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:07.261387', 'step': 3677, 'epoch': 3} {'type': 'loss', 'content': 0.19291433691978455, 'timestamp': '2025-09-05 09:17:07.263485', 'step': 3678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:07.436357', 'step': 3678, 'epoch': 3} {'type': 'loss', 'content': 0.2069120854139328, 'timestamp': '2025-09-05 09:17:07.438453', 'step': 3679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:07.613336', 'step': 3679, 'epoch': 3} {'type': 'loss', 'content': 0.27292466163635254, 'timestamp': '2025-09-05 09:17:07.627569', 'step': 3680, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:17:12.442469', 'step': 3680, 'epoch': 3} {'type': 'pplx', 'content': 58.00292293677735, 'timestamp': '2025-09-05 09:17:12.444593', 'step': 3680, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3680', 'timestamp': '2025-09-05 09:17:12.923472', 'step': 3680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:13.096349', 'step': 3680, 'epoch': 3} {'type': 'loss', 'content': 0.28774937987327576, 'timestamp': '2025-09-05 09:17:13.099635', 'step': 3681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:13.270133', 'step': 3681, 'epoch': 3} {'type': 'loss', 'content': 0.29541826248168945, 'timestamp': '2025-09-05 09:17:13.272636', 'step': 3682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:13.446220', 'step': 3682, 'epoch': 3} {'type': 'loss', 'content': 0.20348146557807922, 'timestamp': '2025-09-05 09:17:13.448403', 'step': 3683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:13.621708', 'step': 3683, 'epoch': 3} {'type': 'loss', 'content': 0.27651840448379517, 'timestamp': '2025-09-05 09:17:13.637686', 'step': 3684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:13.803773', 'step': 3684, 'epoch': 3} {'type': 'loss', 'content': 0.16801585257053375, 'timestamp': '2025-09-05 09:17:13.811331', 'step': 3685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:13.977709', 'step': 3685, 'epoch': 3} {'type': 'loss', 'content': 0.27807697653770447, 'timestamp': '2025-09-05 09:17:13.990085', 'step': 3686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:14.174891', 'step': 3686, 'epoch': 3} {'type': 'loss', 'content': 0.28929510712623596, 'timestamp': '2025-09-05 09:17:14.176862', 'step': 3687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:14.342954', 'step': 3687, 'epoch': 3} {'type': 'loss', 'content': 0.33495303988456726, 'timestamp': '2025-09-05 09:17:14.359164', 'step': 3688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:14.525050', 'step': 3688, 'epoch': 3} {'type': 'loss', 'content': 0.40975499153137207, 'timestamp': '2025-09-05 09:17:14.530323', 'step': 3689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:17:14.707766', 'step': 3689, 'epoch': 3} {'type': 'loss', 'content': 0.34368452429771423, 'timestamp': '2025-09-05 09:17:14.711194', 'step': 3690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:14.885111', 'step': 3690, 'epoch': 3} {'type': 'loss', 'content': 0.21246421337127686, 'timestamp': '2025-09-05 09:17:14.888437', 'step': 3691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:15.063534', 'step': 3691, 'epoch': 3} {'type': 'loss', 'content': 0.17844977974891663, 'timestamp': '2025-09-05 09:17:15.079004', 'step': 3692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:15.247480', 'step': 3692, 'epoch': 3} {'type': 'loss', 'content': 0.23904670774936676, 'timestamp': '2025-09-05 09:17:15.250098', 'step': 3693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:17:15.419302', 'step': 3693, 'epoch': 3} {'type': 'loss', 'content': 0.2476682811975479, 'timestamp': '2025-09-05 09:17:15.421347', 'step': 3694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:15.585971', 'step': 3694, 'epoch': 3} {'type': 'loss', 'content': 0.14145521819591522, 'timestamp': '2025-09-05 09:17:15.587978', 'step': 3695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:15.755238', 'step': 3695, 'epoch': 3} {'type': 'loss', 'content': 0.2892580032348633, 'timestamp': '2025-09-05 09:17:15.769585', 'step': 3696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:15.933810', 'step': 3696, 'epoch': 3} {'type': 'loss', 'content': 0.20448994636535645, 'timestamp': '2025-09-05 09:17:15.935738', 'step': 3697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:16.110181', 'step': 3697, 'epoch': 3} {'type': 'loss', 'content': 0.30457159876823425, 'timestamp': '2025-09-05 09:17:16.112415', 'step': 3698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:16.278273', 'step': 3698, 'epoch': 3} {'type': 'loss', 'content': 0.19800890982151031, 'timestamp': '2025-09-05 09:17:16.281130', 'step': 3699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:16.447546', 'step': 3699, 'epoch': 3} {'type': 'loss', 'content': 0.10996751487255096, 'timestamp': '2025-09-05 09:17:16.464001', 'step': 3700, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:17:21.189675', 'step': 3700, 'epoch': 3} {'type': 'pplx', 'content': 56.65837507121739, 'timestamp': '2025-09-05 09:17:21.192210', 'step': 3700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:21.325124', 'step': 3700, 'epoch': 3} {'type': 'loss', 'content': 0.35211700201034546, 'timestamp': '2025-09-05 09:17:21.328555', 'step': 3701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:21.478549', 'step': 3701, 'epoch': 3} {'type': 'loss', 'content': 0.2543327510356903, 'timestamp': '2025-09-05 09:17:21.483201', 'step': 3702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:21.661615', 'step': 3702, 'epoch': 3} {'type': 'loss', 'content': 0.10399866104125977, 'timestamp': '2025-09-05 09:17:21.663790', 'step': 3703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:21.839507', 'step': 3703, 'epoch': 3} {'type': 'loss', 'content': 0.17878511548042297, 'timestamp': '2025-09-05 09:17:21.853699', 'step': 3704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:22.023690', 'step': 3704, 'epoch': 3} {'type': 'loss', 'content': 0.25016236305236816, 'timestamp': '2025-09-05 09:17:22.026198', 'step': 3705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:22.190767', 'step': 3705, 'epoch': 3} {'type': 'loss', 'content': 0.356041818857193, 'timestamp': '2025-09-05 09:17:22.192750', 'step': 3706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:22.353308', 'step': 3706, 'epoch': 3} {'type': 'loss', 'content': 0.31788352131843567, 'timestamp': '2025-09-05 09:17:22.355417', 'step': 3707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 5440033091648.0}, 'timestamp': '2025-09-05 09:17:22.513433', 'step': 3707, 'epoch': 3} {'type': 'loss', 'content': 0.4455195367336273, 'timestamp': '2025-09-05 09:17:22.529158', 'step': 3708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:22.694433', 'step': 3708, 'epoch': 3} {'type': 'loss', 'content': 0.1478017121553421, 'timestamp': '2025-09-05 09:17:22.696732', 'step': 3709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:22.865305', 'step': 3709, 'epoch': 3} {'type': 'loss', 'content': 0.30112531781196594, 'timestamp': '2025-09-05 09:17:22.867849', 'step': 3710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:23.040859', 'step': 3710, 'epoch': 3} {'type': 'loss', 'content': 0.24474984407424927, 'timestamp': '2025-09-05 09:17:23.043598', 'step': 3711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:23.202386', 'step': 3711, 'epoch': 3} {'type': 'loss', 'content': 0.2871285080909729, 'timestamp': '2025-09-05 09:17:23.217262', 'step': 3712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:23.370485', 'step': 3712, 'epoch': 3} {'type': 'loss', 'content': 0.2523558437824249, 'timestamp': '2025-09-05 09:17:23.373047', 'step': 3713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:23.533342', 'step': 3713, 'epoch': 3} {'type': 'loss', 'content': 0.1851450353860855, 'timestamp': '2025-09-05 09:17:23.535368', 'step': 3714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:23.712360', 'step': 3714, 'epoch': 3} {'type': 'loss', 'content': 0.41676124930381775, 'timestamp': '2025-09-05 09:17:23.714662', 'step': 3715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:23.886528', 'step': 3715, 'epoch': 3} {'type': 'loss', 'content': 0.28601813316345215, 'timestamp': '2025-09-05 09:17:23.903673', 'step': 3716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:17:24.067942', 'step': 3716, 'epoch': 3} {'type': 'loss', 'content': 0.2414342612028122, 'timestamp': '2025-09-05 09:17:24.070264', 'step': 3717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:24.236463', 'step': 3717, 'epoch': 3} {'type': 'loss', 'content': 0.28098100423812866, 'timestamp': '2025-09-05 09:17:24.239880', 'step': 3718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:24.398819', 'step': 3718, 'epoch': 3} {'type': 'loss', 'content': 0.2182604819536209, 'timestamp': '2025-09-05 09:17:24.402058', 'step': 3719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:24.569550', 'step': 3719, 'epoch': 3} {'type': 'loss', 'content': 0.21126191318035126, 'timestamp': '2025-09-05 09:17:24.587286', 'step': 3720, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:17:29.301300', 'step': 3720, 'epoch': 3} {'type': 'pplx', 'content': 57.851098754460935, 'timestamp': '2025-09-05 09:17:29.303386', 'step': 3720, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3720', 'timestamp': '2025-09-05 09:17:29.780448', 'step': 3720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:17:29.918245', 'step': 3720, 'epoch': 3} {'type': 'loss', 'content': 0.17629235982894897, 'timestamp': '2025-09-05 09:17:29.920249', 'step': 3721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:30.077145', 'step': 3721, 'epoch': 3} {'type': 'loss', 'content': 0.19920383393764496, 'timestamp': '2025-09-05 09:17:30.080108', 'step': 3722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:30.257935', 'step': 3722, 'epoch': 3} {'type': 'loss', 'content': 0.1470199078321457, 'timestamp': '2025-09-05 09:17:30.260188', 'step': 3723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:30.421805', 'step': 3723, 'epoch': 3} {'type': 'loss', 'content': 0.3522961735725403, 'timestamp': '2025-09-05 09:17:30.434868', 'step': 3724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:30.588266', 'step': 3724, 'epoch': 3} {'type': 'loss', 'content': 0.22870679199695587, 'timestamp': '2025-09-05 09:17:30.590259', 'step': 3725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:30.751751', 'step': 3725, 'epoch': 3} {'type': 'loss', 'content': 0.35698115825653076, 'timestamp': '2025-09-05 09:17:30.754837', 'step': 3726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:17:30.914440', 'step': 3726, 'epoch': 3} {'type': 'loss', 'content': 0.2308841049671173, 'timestamp': '2025-09-05 09:17:30.916372', 'step': 3727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:31.089362', 'step': 3727, 'epoch': 3} {'type': 'loss', 'content': 0.30304333567619324, 'timestamp': '2025-09-05 09:17:31.103816', 'step': 3728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:31.258454', 'step': 3728, 'epoch': 3} {'type': 'loss', 'content': 0.4719480872154236, 'timestamp': '2025-09-05 09:17:31.261128', 'step': 3729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:31.419266', 'step': 3729, 'epoch': 3} {'type': 'loss', 'content': 0.23239001631736755, 'timestamp': '2025-09-05 09:17:31.421208', 'step': 3730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:31.594334', 'step': 3730, 'epoch': 3} {'type': 'loss', 'content': 0.25132450461387634, 'timestamp': '2025-09-05 09:17:31.596356', 'step': 3731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:31.769554', 'step': 3731, 'epoch': 3} {'type': 'loss', 'content': 0.2377987802028656, 'timestamp': '2025-09-05 09:17:31.782682', 'step': 3732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:31.935284', 'step': 3732, 'epoch': 3} {'type': 'loss', 'content': 0.21928219497203827, 'timestamp': '2025-09-05 09:17:31.937514', 'step': 3733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:32.094333', 'step': 3733, 'epoch': 3} {'type': 'loss', 'content': 0.32776015996932983, 'timestamp': '2025-09-05 09:17:32.096980', 'step': 3734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:32.257921', 'step': 3734, 'epoch': 3} {'type': 'loss', 'content': 0.3067801594734192, 'timestamp': '2025-09-05 09:17:32.260923', 'step': 3735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:32.433004', 'step': 3735, 'epoch': 3} {'type': 'loss', 'content': 0.2571111023426056, 'timestamp': '2025-09-05 09:17:32.450589', 'step': 3736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:32.617665', 'step': 3736, 'epoch': 3} {'type': 'loss', 'content': 0.31286391615867615, 'timestamp': '2025-09-05 09:17:32.621410', 'step': 3737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:32.804127', 'step': 3737, 'epoch': 3} {'type': 'loss', 'content': 0.1266995221376419, 'timestamp': '2025-09-05 09:17:32.806894', 'step': 3738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:32.989557', 'step': 3738, 'epoch': 3} {'type': 'loss', 'content': 0.18480810523033142, 'timestamp': '2025-09-05 09:17:32.993892', 'step': 3739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:33.157154', 'step': 3739, 'epoch': 3} {'type': 'loss', 'content': 0.21677125990390778, 'timestamp': '2025-09-05 09:17:33.174230', 'step': 3740, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:17:37.915992', 'step': 3740, 'epoch': 3} {'type': 'pplx', 'content': 58.138925363076176, 'timestamp': '2025-09-05 09:17:37.919958', 'step': 3740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:38.054986', 'step': 3740, 'epoch': 3} {'type': 'loss', 'content': 0.17171865701675415, 'timestamp': '2025-09-05 09:17:38.058251', 'step': 3741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:38.194742', 'step': 3741, 'epoch': 3} {'type': 'loss', 'content': 0.11303048580884933, 'timestamp': '2025-09-05 09:17:38.196873', 'step': 3742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:38.367087', 'step': 3742, 'epoch': 3} {'type': 'loss', 'content': 0.29843342304229736, 'timestamp': '2025-09-05 09:17:38.369146', 'step': 3743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:38.528710', 'step': 3743, 'epoch': 3} {'type': 'loss', 'content': 0.28549709916114807, 'timestamp': '2025-09-05 09:17:38.545287', 'step': 3744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:38.704412', 'step': 3744, 'epoch': 3} {'type': 'loss', 'content': 0.3105151653289795, 'timestamp': '2025-09-05 09:17:38.707561', 'step': 3745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:38.870199', 'step': 3745, 'epoch': 3} {'type': 'loss', 'content': 0.2878493368625641, 'timestamp': '2025-09-05 09:17:38.874012', 'step': 3746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:39.047543', 'step': 3746, 'epoch': 3} {'type': 'loss', 'content': 0.31547924876213074, 'timestamp': '2025-09-05 09:17:39.049871', 'step': 3747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:17:39.220202', 'step': 3747, 'epoch': 3} {'type': 'loss', 'content': 0.4833051860332489, 'timestamp': '2025-09-05 09:17:39.234546', 'step': 3748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:39.394896', 'step': 3748, 'epoch': 3} {'type': 'loss', 'content': 0.21690459549427032, 'timestamp': '2025-09-05 09:17:39.396991', 'step': 3749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:39.567268', 'step': 3749, 'epoch': 3} {'type': 'loss', 'content': 0.24650557339191437, 'timestamp': '2025-09-05 09:17:39.572449', 'step': 3750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:39.741704', 'step': 3750, 'epoch': 3} {'type': 'loss', 'content': 0.3864779770374298, 'timestamp': '2025-09-05 09:17:39.744322', 'step': 3751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:39.929467', 'step': 3751, 'epoch': 3} {'type': 'loss', 'content': 0.271431028842926, 'timestamp': '2025-09-05 09:17:39.946728', 'step': 3752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:40.129181', 'step': 3752, 'epoch': 3} {'type': 'loss', 'content': 0.2465740144252777, 'timestamp': '2025-09-05 09:17:40.131170', 'step': 3753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:40.290242', 'step': 3753, 'epoch': 3} {'type': 'loss', 'content': 0.18188771605491638, 'timestamp': '2025-09-05 09:17:40.293833', 'step': 3754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:40.453968', 'step': 3754, 'epoch': 3} {'type': 'loss', 'content': 0.3101077079772949, 'timestamp': '2025-09-05 09:17:40.465944', 'step': 3755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:40.631092', 'step': 3755, 'epoch': 3} {'type': 'loss', 'content': 0.3483135402202606, 'timestamp': '2025-09-05 09:17:40.648326', 'step': 3756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:40.816767', 'step': 3756, 'epoch': 3} {'type': 'loss', 'content': 0.3474515676498413, 'timestamp': '2025-09-05 09:17:40.820076', 'step': 3757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:40.995965', 'step': 3757, 'epoch': 3} {'type': 'loss', 'content': 0.3206429183483124, 'timestamp': '2025-09-05 09:17:40.998901', 'step': 3758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:41.157141', 'step': 3758, 'epoch': 3} {'type': 'loss', 'content': 0.3244212567806244, 'timestamp': '2025-09-05 09:17:41.159484', 'step': 3759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:17:41.332000', 'step': 3759, 'epoch': 3} {'type': 'loss', 'content': 0.1970166712999344, 'timestamp': '2025-09-05 09:17:41.348675', 'step': 3760, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:17:46.106629', 'step': 3760, 'epoch': 3} {'type': 'pplx', 'content': 56.030742259845105, 'timestamp': '2025-09-05 09:17:46.109476', 'step': 3760, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3760', 'timestamp': '2025-09-05 09:17:46.628384', 'step': 3760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:46.817598', 'step': 3760, 'epoch': 3} {'type': 'loss', 'content': 0.24627402424812317, 'timestamp': '2025-09-05 09:17:46.819504', 'step': 3761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:47.015458', 'step': 3761, 'epoch': 3} {'type': 'loss', 'content': 0.4071979224681854, 'timestamp': '2025-09-05 09:17:47.017803', 'step': 3762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:47.225117', 'step': 3762, 'epoch': 3} {'type': 'loss', 'content': 0.2320803850889206, 'timestamp': '2025-09-05 09:17:47.228767', 'step': 3763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:47.431687', 'step': 3763, 'epoch': 3} {'type': 'loss', 'content': 0.2511028051376343, 'timestamp': '2025-09-05 09:17:47.447973', 'step': 3764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:47.644725', 'step': 3764, 'epoch': 3} {'type': 'loss', 'content': 0.1637968271970749, 'timestamp': '2025-09-05 09:17:47.646815', 'step': 3765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:47.855989', 'step': 3765, 'epoch': 3} {'type': 'loss', 'content': 0.4574233293533325, 'timestamp': '2025-09-05 09:17:47.858374', 'step': 3766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:48.053732', 'step': 3766, 'epoch': 3} {'type': 'loss', 'content': 0.2990650236606598, 'timestamp': '2025-09-05 09:17:48.055922', 'step': 3767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:48.251155', 'step': 3767, 'epoch': 3} {'type': 'loss', 'content': 0.34469056129455566, 'timestamp': '2025-09-05 09:17:48.264540', 'step': 3768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:48.451902', 'step': 3768, 'epoch': 3} {'type': 'loss', 'content': 0.1231929287314415, 'timestamp': '2025-09-05 09:17:48.454219', 'step': 3769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:48.650029', 'step': 3769, 'epoch': 3} {'type': 'loss', 'content': 0.22926165163516998, 'timestamp': '2025-09-05 09:17:48.652239', 'step': 3770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:48.855961', 'step': 3770, 'epoch': 3} {'type': 'loss', 'content': 0.1999165564775467, 'timestamp': '2025-09-05 09:17:48.858063', 'step': 3771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:49.053725', 'step': 3771, 'epoch': 3} {'type': 'loss', 'content': 0.29211559891700745, 'timestamp': '2025-09-05 09:17:49.068234', 'step': 3772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:49.260693', 'step': 3772, 'epoch': 3} {'type': 'loss', 'content': 0.2825232446193695, 'timestamp': '2025-09-05 09:17:49.265059', 'step': 3773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:49.468313', 'step': 3773, 'epoch': 3} {'type': 'loss', 'content': 0.2714124917984009, 'timestamp': '2025-09-05 09:17:49.473227', 'step': 3774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:49.685717', 'step': 3774, 'epoch': 3} {'type': 'loss', 'content': 0.3337889611721039, 'timestamp': '2025-09-05 09:17:49.692216', 'step': 3775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:49.897333', 'step': 3775, 'epoch': 3} {'type': 'loss', 'content': 0.28709539771080017, 'timestamp': '2025-09-05 09:17:49.911174', 'step': 3776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:50.100013', 'step': 3776, 'epoch': 3} {'type': 'loss', 'content': 0.3871142268180847, 'timestamp': '2025-09-05 09:17:50.101949', 'step': 3777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:50.294723', 'step': 3777, 'epoch': 3} {'type': 'loss', 'content': 0.41545554995536804, 'timestamp': '2025-09-05 09:17:50.296735', 'step': 3778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:50.491618', 'step': 3778, 'epoch': 3} {'type': 'loss', 'content': 0.33116820454597473, 'timestamp': '2025-09-05 09:17:50.494035', 'step': 3779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:50.689535', 'step': 3779, 'epoch': 3} {'type': 'loss', 'content': 0.3106461763381958, 'timestamp': '2025-09-05 09:17:50.705678', 'step': 3780, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:17:55.438519', 'step': 3780, 'epoch': 3} {'type': 'pplx', 'content': 54.86477109539171, 'timestamp': '2025-09-05 09:17:55.440640', 'step': 3780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:55.602465', 'step': 3780, 'epoch': 3} {'type': 'loss', 'content': 0.2829877734184265, 'timestamp': '2025-09-05 09:17:55.604819', 'step': 3781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:55.772017', 'step': 3781, 'epoch': 3} {'type': 'loss', 'content': 0.23081792891025543, 'timestamp': '2025-09-05 09:17:55.774253', 'step': 3782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:55.978453', 'step': 3782, 'epoch': 3} {'type': 'loss', 'content': 0.2172662913799286, 'timestamp': '2025-09-05 09:17:55.980547', 'step': 3783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:17:56.174995', 'step': 3783, 'epoch': 3} {'type': 'loss', 'content': 0.23292969167232513, 'timestamp': '2025-09-05 09:17:56.191374', 'step': 3784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:56.399032', 'step': 3784, 'epoch': 3} {'type': 'loss', 'content': 0.28087952733039856, 'timestamp': '2025-09-05 09:17:56.404084', 'step': 3785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:56.613247', 'step': 3785, 'epoch': 3} {'type': 'loss', 'content': 0.3004533052444458, 'timestamp': '2025-09-05 09:17:56.615302', 'step': 3786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:17:56.811021', 'step': 3786, 'epoch': 3} {'type': 'loss', 'content': 0.22735357284545898, 'timestamp': '2025-09-05 09:17:56.813173', 'step': 3787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:57.008692', 'step': 3787, 'epoch': 3} {'type': 'loss', 'content': 0.24109166860580444, 'timestamp': '2025-09-05 09:17:57.022332', 'step': 3788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:57.210727', 'step': 3788, 'epoch': 3} {'type': 'loss', 'content': 0.25741469860076904, 'timestamp': '2025-09-05 09:17:57.212675', 'step': 3789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:57.407077', 'step': 3789, 'epoch': 3} {'type': 'loss', 'content': 0.18140548467636108, 'timestamp': '2025-09-05 09:17:57.409333', 'step': 3790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:57.612999', 'step': 3790, 'epoch': 3} {'type': 'loss', 'content': 0.2594696879386902, 'timestamp': '2025-09-05 09:17:57.615012', 'step': 3791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:57.810111', 'step': 3791, 'epoch': 3} {'type': 'loss', 'content': 0.2717820703983307, 'timestamp': '2025-09-05 09:17:57.827484', 'step': 3792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:17:58.038952', 'step': 3792, 'epoch': 3} {'type': 'loss', 'content': 0.2998480200767517, 'timestamp': '2025-09-05 09:17:58.041150', 'step': 3793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:58.237041', 'step': 3793, 'epoch': 3} {'type': 'loss', 'content': 0.5077316164970398, 'timestamp': '2025-09-05 09:17:58.241772', 'step': 3794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:17:58.436343', 'step': 3794, 'epoch': 3} {'type': 'loss', 'content': 0.22378866374492645, 'timestamp': '2025-09-05 09:17:58.438145', 'step': 3795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:17:58.632479', 'step': 3795, 'epoch': 3} {'type': 'loss', 'content': 0.2865094840526581, 'timestamp': '2025-09-05 09:17:58.646480', 'step': 3796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:58.834363', 'step': 3796, 'epoch': 3} {'type': 'loss', 'content': 0.2359636276960373, 'timestamp': '2025-09-05 09:17:58.836348', 'step': 3797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:59.039984', 'step': 3797, 'epoch': 3} {'type': 'loss', 'content': 0.2948862612247467, 'timestamp': '2025-09-05 09:17:59.041770', 'step': 3798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:17:59.237989', 'step': 3798, 'epoch': 3} {'type': 'loss', 'content': 0.2476307600736618, 'timestamp': '2025-09-05 09:17:59.243974', 'step': 3799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:17:59.445175', 'step': 3799, 'epoch': 3} {'type': 'loss', 'content': 0.2329883575439453, 'timestamp': '2025-09-05 09:17:59.460504', 'step': 3800, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:18:04.091371', 'step': 3800, 'epoch': 3} {'type': 'pplx', 'content': 55.55614732154259, 'timestamp': '2025-09-05 09:18:04.093508', 'step': 3800, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3800', 'timestamp': '2025-09-05 09:18:04.825315', 'step': 3800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:04.992856', 'step': 3800, 'epoch': 3} {'type': 'loss', 'content': 0.30155929923057556, 'timestamp': '2025-09-05 09:18:04.994966', 'step': 3801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:05.158446', 'step': 3801, 'epoch': 3} {'type': 'loss', 'content': 0.41426223516464233, 'timestamp': '2025-09-05 09:18:05.164363', 'step': 3802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:05.370875', 'step': 3802, 'epoch': 3} {'type': 'loss', 'content': 0.36889323592185974, 'timestamp': '2025-09-05 09:18:05.373209', 'step': 3803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:05.569555', 'step': 3803, 'epoch': 3} {'type': 'loss', 'content': 0.2549019753932953, 'timestamp': '2025-09-05 09:18:05.585448', 'step': 3804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:05.780737', 'step': 3804, 'epoch': 3} {'type': 'loss', 'content': 0.46478819847106934, 'timestamp': '2025-09-05 09:18:05.783228', 'step': 3805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:05.977843', 'step': 3805, 'epoch': 3} {'type': 'loss', 'content': 0.2971610724925995, 'timestamp': '2025-09-05 09:18:05.979793', 'step': 3806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:06.175088', 'step': 3806, 'epoch': 3} {'type': 'loss', 'content': 0.18687593936920166, 'timestamp': '2025-09-05 09:18:06.177352', 'step': 3807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:06.380428', 'step': 3807, 'epoch': 3} {'type': 'loss', 'content': 0.3274900019168854, 'timestamp': '2025-09-05 09:18:06.394606', 'step': 3808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:06.582600', 'step': 3808, 'epoch': 3} {'type': 'loss', 'content': 0.21774989366531372, 'timestamp': '2025-09-05 09:18:06.584933', 'step': 3809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:06.780351', 'step': 3809, 'epoch': 3} {'type': 'loss', 'content': 0.25480151176452637, 'timestamp': '2025-09-05 09:18:06.783183', 'step': 3810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:06.988610', 'step': 3810, 'epoch': 3} {'type': 'loss', 'content': 0.28949615359306335, 'timestamp': '2025-09-05 09:18:06.991180', 'step': 3811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:18:07.197046', 'step': 3811, 'epoch': 3} {'type': 'loss', 'content': 0.3601587116718292, 'timestamp': '2025-09-05 09:18:07.213966', 'step': 3812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:07.409722', 'step': 3812, 'epoch': 3} {'type': 'loss', 'content': 0.2533739507198334, 'timestamp': '2025-09-05 09:18:07.412817', 'step': 3813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:07.617938', 'step': 3813, 'epoch': 3} {'type': 'loss', 'content': 0.3216201364994049, 'timestamp': '2025-09-05 09:18:07.620227', 'step': 3814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:07.823930', 'step': 3814, 'epoch': 3} {'type': 'loss', 'content': 0.26064953207969666, 'timestamp': '2025-09-05 09:18:07.826145', 'step': 3815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:08.029069', 'step': 3815, 'epoch': 3} {'type': 'loss', 'content': 0.4234212338924408, 'timestamp': '2025-09-05 09:18:08.045468', 'step': 3816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:08.239569', 'step': 3816, 'epoch': 3} {'type': 'loss', 'content': 0.22362388670444489, 'timestamp': '2025-09-05 09:18:08.242035', 'step': 3817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:08.444873', 'step': 3817, 'epoch': 3} {'type': 'loss', 'content': 0.3728334307670593, 'timestamp': '2025-09-05 09:18:08.447328', 'step': 3818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:08.649347', 'step': 3818, 'epoch': 3} {'type': 'loss', 'content': 0.46954891085624695, 'timestamp': '2025-09-05 09:18:08.651921', 'step': 3819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:08.843539', 'step': 3819, 'epoch': 3} {'type': 'loss', 'content': 0.2965394854545593, 'timestamp': '2025-09-05 09:18:08.860185', 'step': 3820, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:18:13.496780', 'step': 3820, 'epoch': 3} {'type': 'pplx', 'content': 55.5854559436734, 'timestamp': '2025-09-05 09:18:13.499381', 'step': 3820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:13.660779', 'step': 3820, 'epoch': 3} {'type': 'loss', 'content': 0.22472511231899261, 'timestamp': '2025-09-05 09:18:13.664259', 'step': 3821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:13.866930', 'step': 3821, 'epoch': 3} {'type': 'loss', 'content': 0.25364920496940613, 'timestamp': '2025-09-05 09:18:13.868953', 'step': 3822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:14.062950', 'step': 3822, 'epoch': 3} {'type': 'loss', 'content': 0.21925756335258484, 'timestamp': '2025-09-05 09:18:14.065090', 'step': 3823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:14.259796', 'step': 3823, 'epoch': 3} {'type': 'loss', 'content': 0.2519068419933319, 'timestamp': '2025-09-05 09:18:14.270679', 'step': 3824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:14.439212', 'step': 3824, 'epoch': 3} {'type': 'loss', 'content': 0.3214259743690491, 'timestamp': '2025-09-05 09:18:14.441266', 'step': 3825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:14.644776', 'step': 3825, 'epoch': 3} {'type': 'loss', 'content': 0.30687010288238525, 'timestamp': '2025-09-05 09:18:14.648046', 'step': 3826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:14.844746', 'step': 3826, 'epoch': 3} {'type': 'loss', 'content': 0.2706732451915741, 'timestamp': '2025-09-05 09:18:14.846875', 'step': 3827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 2560015608320.0}, 'timestamp': '2025-09-05 09:18:15.039578', 'step': 3827, 'epoch': 3} {'type': 'loss', 'content': 0.21124766767024994, 'timestamp': '2025-09-05 09:18:15.053944', 'step': 3828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:15.240514', 'step': 3828, 'epoch': 3} {'type': 'loss', 'content': 0.2378242313861847, 'timestamp': '2025-09-05 09:18:15.242970', 'step': 3829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:15.436793', 'step': 3829, 'epoch': 3} {'type': 'loss', 'content': 0.37138789892196655, 'timestamp': '2025-09-05 09:18:15.439225', 'step': 3830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:15.636069', 'step': 3830, 'epoch': 3} {'type': 'loss', 'content': 0.48219579458236694, 'timestamp': '2025-09-05 09:18:15.639027', 'step': 3831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:15.844375', 'step': 3831, 'epoch': 3} {'type': 'loss', 'content': 0.31807440519332886, 'timestamp': '2025-09-05 09:18:15.853360', 'step': 3832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:16.014950', 'step': 3832, 'epoch': 3} {'type': 'loss', 'content': 0.19948077201843262, 'timestamp': '2025-09-05 09:18:16.016914', 'step': 3833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:16.222462', 'step': 3833, 'epoch': 3} {'type': 'loss', 'content': 0.3451708257198334, 'timestamp': '2025-09-05 09:18:16.227796', 'step': 3834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:16.432408', 'step': 3834, 'epoch': 3} {'type': 'loss', 'content': 0.1525815725326538, 'timestamp': '2025-09-05 09:18:16.434547', 'step': 3835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:16.629223', 'step': 3835, 'epoch': 3} {'type': 'loss', 'content': 0.25581640005111694, 'timestamp': '2025-09-05 09:18:16.644719', 'step': 3836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:16.841468', 'step': 3836, 'epoch': 3} {'type': 'loss', 'content': 0.1911073625087738, 'timestamp': '2025-09-05 09:18:16.843859', 'step': 3837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:17.040110', 'step': 3837, 'epoch': 3} {'type': 'loss', 'content': 0.1686500757932663, 'timestamp': '2025-09-05 09:18:17.042298', 'step': 3838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:17.248084', 'step': 3838, 'epoch': 3} {'type': 'loss', 'content': 0.15377473831176758, 'timestamp': '2025-09-05 09:18:17.250236', 'step': 3839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:17.446870', 'step': 3839, 'epoch': 3} {'type': 'loss', 'content': 0.33481061458587646, 'timestamp': '2025-09-05 09:18:17.461064', 'step': 3840, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:18:22.080629', 'step': 3840, 'epoch': 3} {'type': 'pplx', 'content': 55.37722131907398, 'timestamp': '2025-09-05 09:18:22.082611', 'step': 3840, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3840', 'timestamp': '2025-09-05 09:18:22.537339', 'step': 3840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:22.697793', 'step': 3840, 'epoch': 3} {'type': 'loss', 'content': 0.2810446321964264, 'timestamp': '2025-09-05 09:18:22.700095', 'step': 3841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:22.893165', 'step': 3841, 'epoch': 3} {'type': 'loss', 'content': 0.14373518526554108, 'timestamp': '2025-09-05 09:18:22.895756', 'step': 3842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:23.098992', 'step': 3842, 'epoch': 3} {'type': 'loss', 'content': 0.3351125717163086, 'timestamp': '2025-09-05 09:18:23.101228', 'step': 3843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:23.296487', 'step': 3843, 'epoch': 3} {'type': 'loss', 'content': 0.3412880599498749, 'timestamp': '2025-09-05 09:18:23.310519', 'step': 3844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:23.496121', 'step': 3844, 'epoch': 3} {'type': 'loss', 'content': 0.22351853549480438, 'timestamp': '2025-09-05 09:18:23.498529', 'step': 3845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:23.692705', 'step': 3845, 'epoch': 3} {'type': 'loss', 'content': 0.31305429339408875, 'timestamp': '2025-09-05 09:18:23.695051', 'step': 3846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:18:23.860711', 'step': 3846, 'epoch': 3} {'type': 'loss', 'content': 0.3345796763896942, 'timestamp': '2025-09-05 09:18:23.862968', 'step': 3847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:24.066541', 'step': 3847, 'epoch': 3} {'type': 'loss', 'content': 0.2449566274881363, 'timestamp': '2025-09-05 09:18:24.081763', 'step': 3848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:24.276787', 'step': 3848, 'epoch': 3} {'type': 'loss', 'content': 0.23441752791404724, 'timestamp': '2025-09-05 09:18:24.279105', 'step': 3849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:24.474893', 'step': 3849, 'epoch': 3} {'type': 'loss', 'content': 0.2842091917991638, 'timestamp': '2025-09-05 09:18:24.477000', 'step': 3850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:18:24.673146', 'step': 3850, 'epoch': 3} {'type': 'loss', 'content': 0.4027588963508606, 'timestamp': '2025-09-05 09:18:24.675318', 'step': 3851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:24.871283', 'step': 3851, 'epoch': 3} {'type': 'loss', 'content': 0.36195552349090576, 'timestamp': '2025-09-05 09:18:24.885328', 'step': 3852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:25.070758', 'step': 3852, 'epoch': 3} {'type': 'loss', 'content': 0.21417009830474854, 'timestamp': '2025-09-05 09:18:25.073293', 'step': 3853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:25.268334', 'step': 3853, 'epoch': 3} {'type': 'loss', 'content': 0.21039460599422455, 'timestamp': '2025-09-05 09:18:25.270383', 'step': 3854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:25.464573', 'step': 3854, 'epoch': 3} {'type': 'loss', 'content': 0.2987714409828186, 'timestamp': '2025-09-05 09:18:25.466698', 'step': 3855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:25.628851', 'step': 3855, 'epoch': 3} {'type': 'loss', 'content': 0.3201431930065155, 'timestamp': '2025-09-05 09:18:25.645120', 'step': 3856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:25.839423', 'step': 3856, 'epoch': 3} {'type': 'loss', 'content': 0.3816319704055786, 'timestamp': '2025-09-05 09:18:25.841608', 'step': 3857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:26.035555', 'step': 3857, 'epoch': 3} {'type': 'loss', 'content': 0.29688769578933716, 'timestamp': '2025-09-05 09:18:26.037739', 'step': 3858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:26.230567', 'step': 3858, 'epoch': 3} {'type': 'loss', 'content': 0.3592606484889984, 'timestamp': '2025-09-05 09:18:26.232683', 'step': 3859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:26.426548', 'step': 3859, 'epoch': 3} {'type': 'loss', 'content': 0.3628304898738861, 'timestamp': '2025-09-05 09:18:26.440593', 'step': 3860, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:18:31.083837', 'step': 3860, 'epoch': 3} {'type': 'pplx', 'content': 56.521663876148416, 'timestamp': '2025-09-05 09:18:31.085627', 'step': 3860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:31.245688', 'step': 3860, 'epoch': 3} {'type': 'loss', 'content': 0.2278020679950714, 'timestamp': '2025-09-05 09:18:31.248250', 'step': 3861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:31.414379', 'step': 3861, 'epoch': 3} {'type': 'loss', 'content': 0.2553630471229553, 'timestamp': '2025-09-05 09:18:31.416714', 'step': 3862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:31.622523', 'step': 3862, 'epoch': 3} {'type': 'loss', 'content': 0.31360796093940735, 'timestamp': '2025-09-05 09:18:31.625098', 'step': 3863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:31.821089', 'step': 3863, 'epoch': 3} {'type': 'loss', 'content': 0.23280321061611176, 'timestamp': '2025-09-05 09:18:31.835268', 'step': 3864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:32.022303', 'step': 3864, 'epoch': 3} {'type': 'loss', 'content': 0.2609652876853943, 'timestamp': '2025-09-05 09:18:32.024305', 'step': 3865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:18:32.226513', 'step': 3865, 'epoch': 3} {'type': 'loss', 'content': 0.44476309418678284, 'timestamp': '2025-09-05 09:18:32.228929', 'step': 3866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:32.422074', 'step': 3866, 'epoch': 3} {'type': 'loss', 'content': 0.1847713738679886, 'timestamp': '2025-09-05 09:18:32.424353', 'step': 3867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:32.618727', 'step': 3867, 'epoch': 3} {'type': 'loss', 'content': 0.31068137288093567, 'timestamp': '2025-09-05 09:18:32.633213', 'step': 3868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:32.820267', 'step': 3868, 'epoch': 3} {'type': 'loss', 'content': 0.2597085237503052, 'timestamp': '2025-09-05 09:18:32.822691', 'step': 3869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:33.026080', 'step': 3869, 'epoch': 3} {'type': 'loss', 'content': 0.27988213300704956, 'timestamp': '2025-09-05 09:18:33.028025', 'step': 3870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:33.222332', 'step': 3870, 'epoch': 3} {'type': 'loss', 'content': 0.28773733973503113, 'timestamp': '2025-09-05 09:18:33.224225', 'step': 3871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:33.418058', 'step': 3871, 'epoch': 3} {'type': 'loss', 'content': 0.2838188707828522, 'timestamp': '2025-09-05 09:18:33.432683', 'step': 3872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:33.620117', 'step': 3872, 'epoch': 3} {'type': 'loss', 'content': 0.31004562973976135, 'timestamp': '2025-09-05 09:18:33.622110', 'step': 3873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:33.815513', 'step': 3873, 'epoch': 3} {'type': 'loss', 'content': 0.19978618621826172, 'timestamp': '2025-09-05 09:18:33.817791', 'step': 3874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:34.021912', 'step': 3874, 'epoch': 3} {'type': 'loss', 'content': 0.18819376826286316, 'timestamp': '2025-09-05 09:18:34.024432', 'step': 3875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:34.219846', 'step': 3875, 'epoch': 3} {'type': 'loss', 'content': 0.2756434381008148, 'timestamp': '2025-09-05 09:18:34.234092', 'step': 3876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:18:34.423758', 'step': 3876, 'epoch': 3} {'type': 'loss', 'content': 0.22656309604644775, 'timestamp': '2025-09-05 09:18:34.426614', 'step': 3877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:18:34.632024', 'step': 3877, 'epoch': 3} {'type': 'loss', 'content': 0.3541288375854492, 'timestamp': '2025-09-05 09:18:34.633953', 'step': 3878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:34.798791', 'step': 3878, 'epoch': 3} {'type': 'loss', 'content': 0.1255226582288742, 'timestamp': '2025-09-05 09:18:34.800876', 'step': 3879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:34.964815', 'step': 3879, 'epoch': 3} {'type': 'loss', 'content': 0.345575213432312, 'timestamp': '2025-09-05 09:18:34.981003', 'step': 3880, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:18:39.638308', 'step': 3880, 'epoch': 3} {'type': 'pplx', 'content': 57.5111446457821, 'timestamp': '2025-09-05 09:18:39.640566', 'step': 3880, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 3880', 'timestamp': '2025-09-05 09:18:40.303411', 'step': 3880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:40.494156', 'step': 3880, 'epoch': 3} {'type': 'loss', 'content': 0.29227620363235474, 'timestamp': '2025-09-05 09:18:40.496215', 'step': 3881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:40.708619', 'step': 3881, 'epoch': 3} {'type': 'loss', 'content': 0.30040618777275085, 'timestamp': '2025-09-05 09:18:40.710697', 'step': 3882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:40.906165', 'step': 3882, 'epoch': 3} {'type': 'loss', 'content': 0.2703894376754761, 'timestamp': '2025-09-05 09:18:40.908114', 'step': 3883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:41.104515', 'step': 3883, 'epoch': 3} {'type': 'loss', 'content': 0.344621866941452, 'timestamp': '2025-09-05 09:18:41.120488', 'step': 3884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:41.315270', 'step': 3884, 'epoch': 3} {'type': 'loss', 'content': 0.17300912737846375, 'timestamp': '2025-09-05 09:18:41.318283', 'step': 3885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 4480027263872.0}, 'timestamp': '2025-09-05 09:18:41.525061', 'step': 3885, 'epoch': 3} {'type': 'loss', 'content': 0.4111814796924591, 'timestamp': '2025-09-05 09:18:41.527036', 'step': 3886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:41.724233', 'step': 3886, 'epoch': 3} {'type': 'loss', 'content': 0.32882702350616455, 'timestamp': '2025-09-05 09:18:41.726372', 'step': 3887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 4800029206464.0}, 'timestamp': '2025-09-05 09:18:41.932520', 'step': 3887, 'epoch': 3} {'type': 'loss', 'content': 0.361674040555954, 'timestamp': '2025-09-05 09:18:41.941725', 'step': 3888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:42.104101', 'step': 3888, 'epoch': 3} {'type': 'loss', 'content': 0.1684613674879074, 'timestamp': '2025-09-05 09:18:42.105741', 'step': 3889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:42.309251', 'step': 3889, 'epoch': 3} {'type': 'loss', 'content': 0.20353275537490845, 'timestamp': '2025-09-05 09:18:42.311380', 'step': 3890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:42.514218', 'step': 3890, 'epoch': 3} {'type': 'loss', 'content': 0.22002647817134857, 'timestamp': '2025-09-05 09:18:42.516313', 'step': 3891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:42.711114', 'step': 3891, 'epoch': 3} {'type': 'loss', 'content': 0.3730429708957672, 'timestamp': '2025-09-05 09:18:42.724982', 'step': 3892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 3200019493504.0}, 'timestamp': '2025-09-05 09:18:42.912985', 'step': 3892, 'epoch': 3} {'type': 'loss', 'content': 0.30563536286354065, 'timestamp': '2025-09-05 09:18:42.915080', 'step': 3893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:43.110081', 'step': 3893, 'epoch': 3} {'type': 'loss', 'content': 0.3045671880245209, 'timestamp': '2025-09-05 09:18:43.112307', 'step': 3894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 3840023378688.0}, 'timestamp': '2025-09-05 09:18:43.329512', 'step': 3894, 'epoch': 3} {'type': 'loss', 'content': 0.2163662165403366, 'timestamp': '2025-09-05 09:18:43.331571', 'step': 3895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 4160025321280.0}, 'timestamp': '2025-09-05 09:18:43.529117', 'step': 3895, 'epoch': 3} {'type': 'loss', 'content': 0.2787623703479767, 'timestamp': '2025-09-05 09:18:43.537962', 'step': 3896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:43.700965', 'step': 3896, 'epoch': 3} {'type': 'loss', 'content': 0.3941419720649719, 'timestamp': '2025-09-05 09:18:43.703207', 'step': 3897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:43.867354', 'step': 3897, 'epoch': 3} {'type': 'loss', 'content': 0.22157038748264313, 'timestamp': '2025-09-05 09:18:43.869374', 'step': 3898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 3520021436096.0}, 'timestamp': '2025-09-05 09:18:44.035260', 'step': 3898, 'epoch': 3} {'type': 'loss', 'content': 0.21150384843349457, 'timestamp': '2025-09-05 09:18:44.037093', 'step': 3899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 2880017550912.0}, 'timestamp': '2025-09-05 09:18:44.202709', 'step': 3899, 'epoch': 3} {'type': 'loss', 'content': 0.14150860905647278, 'timestamp': '2025-09-05 09:18:44.211577', 'step': 3900, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:18:48.840423', 'step': 3900, 'epoch': 3} {'type': 'pplx', 'content': 57.68526548554968, 'timestamp': '2025-09-05 09:18:48.842740', 'step': 3900, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4151977605760}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3832594718208}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2235680280448}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3193828943104}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2555063168000}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2874446055552}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3513211830656}, {'type': 'perplexity', 'in_batch_dim': [2, 160], 'batch_size': 8, 'flops': 3193828943104}], 'timestamp': '2025-09-05 09:18:53.413113', 'step': 3900, 'epoch': 3} {'type': 'pplx', 'content': 57.68526548554968, 'timestamp': '2025-09-05 09:18:53.415162', 'step': 3900, 'epoch': 3} {'type': 'best_pplx', 'content': 52.40073254912315, 'timestamp': '2025-09-05 09:18:53.416944', 'step': 3900, 'epoch': 3} {'type': 'best_step', 'content': 3040, 'timestamp': '2025-09-05 09:18:53.418308', 'step': 3900, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 49705559881469696, 'timestamp': '2025-09-05 09:18:53.420522', 'step': 3900, 'epoch': 3} {'type': 'total_train_flops', 'content': 1.368968336766336e+16, 'timestamp': '2025-09-05 09:18:53.781419', 'step': 3900, 'epoch': 3}