{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:10:53.095686', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 226674977.87649825, 'timestamp': '2025-10-01 04:10:53.105798', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:53.194138', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.703412652015686, 'timestamp': '2025-10-01 04:10:53.200479', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.270265', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.699732780456543, 'timestamp': '2025-10-01 04:10:53.285188', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.347661', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.7366790175437927, 'timestamp': '2025-10-01 04:10:53.351307', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.391890', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.7117161154747009, 'timestamp': '2025-10-01 04:10:53.461222', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.507281', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.12175176292657852, 'timestamp': '2025-10-01 04:10:53.511748', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.569934', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.11768395453691483, 'timestamp': '2025-10-01 04:10:53.575739', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:53.619858', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.12094881385564804, 'timestamp': '2025-10-01 04:10:53.629033', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.676006', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.13165441155433655, 'timestamp': '2025-10-01 04:10:53.704869', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.746265', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.010522748343646526, 'timestamp': '2025-10-01 04:10:53.752986', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.795388', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.042851757258176804, 'timestamp': '2025-10-01 04:10:53.800877', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.843002', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.026877496391534805, 'timestamp': '2025-10-01 04:10:53.851789', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.898927', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.01061374880373478, 'timestamp': '2025-10-01 04:10:53.926835', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:53.974466', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.025151683017611504, 'timestamp': '2025-10-01 04:10:53.981624', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.027921', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.06414594501256943, 'timestamp': '2025-10-01 04:10:54.034779', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.083254', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.042276572436094284, 'timestamp': '2025-10-01 04:10:54.092531', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.142288', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.024483708664774895, 'timestamp': '2025-10-01 04:10:54.173196', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.226170', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.02734944596886635, 'timestamp': '2025-10-01 04:10:54.229034', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.279877', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.02907527983188629, 'timestamp': '2025-10-01 04:10:54.285924', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:54.332981', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.02434028685092926, 'timestamp': '2025-10-01 04:10:54.341022', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.389533', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.026969805359840393, 'timestamp': '2025-10-01 04:10:54.418017', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:54.466528', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.031657036393880844, 'timestamp': '2025-10-01 04:10:54.470219', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.509071', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.03591819852590561, 'timestamp': '2025-10-01 04:10:54.516264', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.567711', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.025713395327329636, 'timestamp': '2025-10-01 04:10:54.574780', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.623030', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.024056894704699516, 'timestamp': '2025-10-01 04:10:54.653931', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.699467', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.022549999877810478, 'timestamp': '2025-10-01 04:10:54.705025', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:54.761120', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.028963249176740646, 'timestamp': '2025-10-01 04:10:54.769632', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:54.813599', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.020357603207230568, 'timestamp': '2025-10-01 04:10:54.822712', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.866029', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.022364825010299683, 'timestamp': '2025-10-01 04:10:54.895471', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:54.946141', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.03066479228436947, 'timestamp': '2025-10-01 04:10:54.956692', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.014996', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.021274466067552567, 'timestamp': '2025-10-01 04:10:55.026123', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.087929', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.02097862772643566, 'timestamp': '2025-10-01 04:10:55.100150', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.157840', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.0193604938685894, 'timestamp': '2025-10-01 04:10:55.189931', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.238573', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.011159072630107403, 'timestamp': '2025-10-01 04:10:55.247628', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.298064', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.029572388157248497, 'timestamp': '2025-10-01 04:10:55.306525', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.350752', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.011771670542657375, 'timestamp': '2025-10-01 04:10:55.361310', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.411118', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.018943196162581444, 'timestamp': '2025-10-01 04:10:55.441215', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.488907', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.014170250855386257, 'timestamp': '2025-10-01 04:10:55.497674', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.547353', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.0197011586278677, 'timestamp': '2025-10-01 04:10:55.555220', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:55.598036', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.010018707253038883, 'timestamp': '2025-10-01 04:10:55.602924', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.647184', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.019700050354003906, 'timestamp': '2025-10-01 04:10:55.674501', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.720735', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.008056686259806156, 'timestamp': '2025-10-01 04:10:55.727517', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:55.773450', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.01834237389266491, 'timestamp': '2025-10-01 04:10:55.783825', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.826968', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.03287697210907936, 'timestamp': '2025-10-01 04:10:55.829460', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:10:55.890294', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.007924867793917656, 'timestamp': '2025-10-01 04:10:55.914007', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.944304', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.006307397969067097, 'timestamp': '2025-10-01 04:10:55.946373', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:55.976313', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.005574687384068966, 'timestamp': '2025-10-01 04:10:55.978655', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:56.008745', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.037656109780073166, 'timestamp': '2025-10-01 04:10:56.011071', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:56.041579', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.021889669820666313, 'timestamp': '2025-10-01 04:10:56.065327', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:56.096017', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.004702998790889978, 'timestamp': '2025-10-01 04:10:56.098076', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:56.131661', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.03982829302549362, 'timestamp': '2025-10-01 04:10:56.134107', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:56.164382', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.005432396661490202, 'timestamp': '2025-10-01 04:10:56.166655', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:56.197372', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.03515584021806717, 'timestamp': '2025-10-01 04:10:56.221336', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:56.252932', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.036965373903512955, 'timestamp': '2025-10-01 04:10:56.256656', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:10:56.287467', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.03869860619306564, 'timestamp': '2025-10-01 04:10:56.289770', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:56.319993', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.019588610157370567, 'timestamp': '2025-10-01 04:10:56.322701', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:56.353408', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.039842940866947174, 'timestamp': '2025-10-01 04:10:56.377393', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:56.408377', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.020486917346715927, 'timestamp': '2025-10-01 04:10:56.410470', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:10:57.137405', 'step': 57, 'epoch': 1} {'type': 'pplx', 'content': 104877281.17432675, 'timestamp': '2025-10-01 04:10:57.139218', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.169214', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.01975156180560589, 'timestamp': '2025-10-01 04:10:57.171568', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.206987', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.018474208191037178, 'timestamp': '2025-10-01 04:10:57.209120', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.240574', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.020764997228980064, 'timestamp': '2025-10-01 04:10:57.264650', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.298954', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.019322749227285385, 'timestamp': '2025-10-01 04:10:57.301059', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:57.331441', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.020046358928084373, 'timestamp': '2025-10-01 04:10:57.333636', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:57.363712', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.01184836309403181, 'timestamp': '2025-10-01 04:10:57.366942', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.398273', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.010952807031571865, 'timestamp': '2025-10-01 04:10:57.422021', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.453477', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.013404452241957188, 'timestamp': '2025-10-01 04:10:57.455327', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:57.486321', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.02560088224709034, 'timestamp': '2025-10-01 04:10:57.488536', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.520393', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.01485830545425415, 'timestamp': '2025-10-01 04:10:57.522674', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.555043', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.024088138714432716, 'timestamp': '2025-10-01 04:10:57.579039', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:57.615446', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.026860293000936508, 'timestamp': '2025-10-01 04:10:57.617565', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.648283', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.019698916003108025, 'timestamp': '2025-10-01 04:10:57.650365', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.681342', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.016542484983801842, 'timestamp': '2025-10-01 04:10:57.683869', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.715002', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.026610536500811577, 'timestamp': '2025-10-01 04:10:57.738746', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:57.769910', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.02022835798561573, 'timestamp': '2025-10-01 04:10:57.772056', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.804012', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.02639939822256565, 'timestamp': '2025-10-01 04:10:57.806345', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.837218', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.020459027960896492, 'timestamp': '2025-10-01 04:10:57.839591', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:10:57.869623', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.022298503667116165, 'timestamp': '2025-10-01 04:10:57.893644', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.928200', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.02111753635108471, 'timestamp': '2025-10-01 04:10:57.930478', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.961168', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.025581015273928642, 'timestamp': '2025-10-01 04:10:57.963498', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:57.994194', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.024683058261871338, 'timestamp': '2025-10-01 04:10:57.996316', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:58.030327', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.02450932003557682, 'timestamp': '2025-10-01 04:10:58.053881', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.086434', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.02360459603369236, 'timestamp': '2025-10-01 04:10:58.088397', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.119496', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.02305099368095398, 'timestamp': '2025-10-01 04:10:58.122557', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.153531', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.033070627599954605, 'timestamp': '2025-10-01 04:10:58.156013', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.190179', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.026605265215039253, 'timestamp': '2025-10-01 04:10:58.214044', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.245854', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.023719897493720055, 'timestamp': '2025-10-01 04:10:58.248348', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.279713', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.016296805813908577, 'timestamp': '2025-10-01 04:10:58.282101', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.313234', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.019035162404179573, 'timestamp': '2025-10-01 04:10:58.315368', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.346123', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.018415704369544983, 'timestamp': '2025-10-01 04:10:58.369870', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.408527', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.01595030352473259, 'timestamp': '2025-10-01 04:10:58.411022', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:58.442729', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.02289186790585518, 'timestamp': '2025-10-01 04:10:58.444877', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.475873', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.02066519856452942, 'timestamp': '2025-10-01 04:10:58.477833', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:10:58.508103', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.02692834846675396, 'timestamp': '2025-10-01 04:10:58.531867', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.562945', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.016894636675715446, 'timestamp': '2025-10-01 04:10:58.565180', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.596827', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.02401311881840229, 'timestamp': '2025-10-01 04:10:58.598918', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.629500', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.020219998434185982, 'timestamp': '2025-10-01 04:10:58.631735', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.661362', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.02421734668314457, 'timestamp': '2025-10-01 04:10:58.685139', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.717069', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.019959433004260063, 'timestamp': '2025-10-01 04:10:58.719484', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.750834', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.021867400035262108, 'timestamp': '2025-10-01 04:10:58.753140', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.784119', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.017920518293976784, 'timestamp': '2025-10-01 04:10:58.786535', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.817171', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.020041782408952713, 'timestamp': '2025-10-01 04:10:58.840975', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.872564', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.018996138125658035, 'timestamp': '2025-10-01 04:10:58.874761', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.905882', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.01724168471992016, 'timestamp': '2025-10-01 04:10:58.907742', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.939080', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.020902881398797035, 'timestamp': '2025-10-01 04:10:58.941101', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:58.971484', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.027707403525710106, 'timestamp': '2025-10-01 04:10:58.995345', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:59.026821', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.03331645205616951, 'timestamp': '2025-10-01 04:10:59.029874', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:59.065983', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.009206474758684635, 'timestamp': '2025-10-01 04:10:59.068593', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:59.099413', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.01976872608065605, 'timestamp': '2025-10-01 04:10:59.101505', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:59.132651', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.04233280569314957, 'timestamp': '2025-10-01 04:10:59.156597', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:59.188553', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.04243811219930649, 'timestamp': '2025-10-01 04:10:59.190853', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:10:59.231560', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.03881783038377762, 'timestamp': '2025-10-01 04:10:59.234032', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:59.264775', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.008653001859784126, 'timestamp': '2025-10-01 04:10:59.267086', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:59.297628', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.027102604508399963, 'timestamp': '2025-10-01 04:10:59.321517', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:10:59.358722', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.029516849666833878, 'timestamp': '2025-10-01 04:10:59.373841', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:10:59.438262', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.021250639110803604, 'timestamp': '2025-10-01 04:10:59.443863', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:00.689761', 'step': 114, 'epoch': 1} {'type': 'pplx', 'content': 119027336.35998133, 'timestamp': '2025-10-01 04:11:00.691802', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:00.721252', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.010005722753703594, 'timestamp': '2025-10-01 04:11:00.723303', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:00.754543', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.0302440095692873, 'timestamp': '2025-10-01 04:11:00.778465', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:00.811694', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.021053824573755264, 'timestamp': '2025-10-01 04:11:00.813938', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:00.844816', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.019699398428201675, 'timestamp': '2025-10-01 04:11:00.846787', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:00.876873', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.018002357333898544, 'timestamp': '2025-10-01 04:11:00.879564', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:00.910805', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.02030670829117298, 'timestamp': '2025-10-01 04:11:00.934825', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:00.967310', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.030375460162758827, 'timestamp': '2025-10-01 04:11:00.969369', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:00.999999', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.021546974778175354, 'timestamp': '2025-10-01 04:11:01.002224', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:01.033058', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.022841254249215126, 'timestamp': '2025-10-01 04:11:01.035390', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.066973', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.017218273133039474, 'timestamp': '2025-10-01 04:11:01.090983', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.123155', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.019611822441220284, 'timestamp': '2025-10-01 04:11:01.126793', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.162107', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.030845703557133675, 'timestamp': '2025-10-01 04:11:01.164762', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:01.200174', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.020298536866903305, 'timestamp': '2025-10-01 04:11:01.203233', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.237161', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.020675739273428917, 'timestamp': '2025-10-01 04:11:01.261686', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.293347', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.02181154489517212, 'timestamp': '2025-10-01 04:11:01.295522', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.326822', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.025619324296712875, 'timestamp': '2025-10-01 04:11:01.329057', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.360457', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.028702473267912865, 'timestamp': '2025-10-01 04:11:01.362897', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.394496', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.020739736035466194, 'timestamp': '2025-10-01 04:11:01.418233', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.450579', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.01721821539103985, 'timestamp': '2025-10-01 04:11:01.452685', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.483621', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.021848777309060097, 'timestamp': '2025-10-01 04:11:01.485842', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:01.517014', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.02322843298316002, 'timestamp': '2025-10-01 04:11:01.519106', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.551092', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.020518863573670387, 'timestamp': '2025-10-01 04:11:01.574846', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:01.606313', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.020729046314954758, 'timestamp': '2025-10-01 04:11:01.608280', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.639677', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.02201726660132408, 'timestamp': '2025-10-01 04:11:01.642066', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.672490', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.01811814494431019, 'timestamp': '2025-10-01 04:11:01.674619', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.705919', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.0218853447586298, 'timestamp': '2025-10-01 04:11:01.731068', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.765941', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.01973516307771206, 'timestamp': '2025-10-01 04:11:01.768961', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.810879', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.02040678821504116, 'timestamp': '2025-10-01 04:11:01.813675', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:01.846417', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.02637314982712269, 'timestamp': '2025-10-01 04:11:01.849125', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.882598', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.015394523739814758, 'timestamp': '2025-10-01 04:11:01.908575', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.941974', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.016578085720539093, 'timestamp': '2025-10-01 04:11:01.944314', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:01.977437', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.013782672584056854, 'timestamp': '2025-10-01 04:11:01.980941', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.015604', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.029022570699453354, 'timestamp': '2025-10-01 04:11:02.018395', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.051534', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.030791154131293297, 'timestamp': '2025-10-01 04:11:02.075655', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:02.109123', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.04107801243662834, 'timestamp': '2025-10-01 04:11:02.111665', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.144488', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.041130490601062775, 'timestamp': '2025-10-01 04:11:02.147665', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.183413', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.019808387383818626, 'timestamp': '2025-10-01 04:11:02.187533', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.225379', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.007825485430657864, 'timestamp': '2025-10-01 04:11:02.251029', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:02.286208', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.013624719344079494, 'timestamp': '2025-10-01 04:11:02.289379', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:02.323189', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.027168406173586845, 'timestamp': '2025-10-01 04:11:02.325457', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.356991', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.032501544803380966, 'timestamp': '2025-10-01 04:11:02.359146', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.390044', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.021303342655301094, 'timestamp': '2025-10-01 04:11:02.413928', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.445555', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.032604530453681946, 'timestamp': '2025-10-01 04:11:02.448009', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.478434', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.05039301887154579, 'timestamp': '2025-10-01 04:11:02.480665', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:02.511680', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.039981160312891006, 'timestamp': '2025-10-01 04:11:02.513943', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.544777', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.028644157573580742, 'timestamp': '2025-10-01 04:11:02.568678', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.599445', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.02378627099096775, 'timestamp': '2025-10-01 04:11:02.601720', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.632948', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.0275709368288517, 'timestamp': '2025-10-01 04:11:02.635080', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.666605', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.02510923705995083, 'timestamp': '2025-10-01 04:11:02.668605', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:02.699142', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.022733593359589577, 'timestamp': '2025-10-01 04:11:02.723615', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.755022', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.023119447752833366, 'timestamp': '2025-10-01 04:11:02.757374', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:02.787948', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.03037671558558941, 'timestamp': '2025-10-01 04:11:02.790144', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.822656', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.02896520495414734, 'timestamp': '2025-10-01 04:11:02.824787', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.855535', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.025343691930174828, 'timestamp': '2025-10-01 04:11:02.879525', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.910741', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.024414045736193657, 'timestamp': '2025-10-01 04:11:02.913245', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:02.944286', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.029930496588349342, 'timestamp': '2025-10-01 04:11:02.946470', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:02.976836', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.024483921006321907, 'timestamp': '2025-10-01 04:11:02.979151', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:03.712425', 'step': 171, 'epoch': 1} {'type': 'pplx', 'content': 120769265.99599761, 'timestamp': '2025-10-01 04:11:03.714527', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:03.748965', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.024266691878437996, 'timestamp': '2025-10-01 04:11:03.772876', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:03.804998', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.02391829900443554, 'timestamp': '2025-10-01 04:11:03.807806', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:03.843090', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.023773299530148506, 'timestamp': '2025-10-01 04:11:03.845386', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:03.880001', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.02384672500193119, 'timestamp': '2025-10-01 04:11:03.882347', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:03.915356', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.025520270690321922, 'timestamp': '2025-10-01 04:11:03.939925', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:03.973267', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.018843820318579674, 'timestamp': '2025-10-01 04:11:03.976044', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.008928', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.018038298934698105, 'timestamp': '2025-10-01 04:11:04.011534', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.047044', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.01665923371911049, 'timestamp': '2025-10-01 04:11:04.049664', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.081583', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.024666251614689827, 'timestamp': '2025-10-01 04:11:04.105542', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.139707', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.020171239972114563, 'timestamp': '2025-10-01 04:11:04.144480', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.180714', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.030821431428194046, 'timestamp': '2025-10-01 04:11:04.184291', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.221497', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.05390426889061928, 'timestamp': '2025-10-01 04:11:04.225907', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.263174', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.03358237445354462, 'timestamp': '2025-10-01 04:11:04.289628', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:04.327093', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.04122252017259598, 'timestamp': '2025-10-01 04:11:04.331695', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.369519', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.021640826016664505, 'timestamp': '2025-10-01 04:11:04.372368', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.406874', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.03316246345639229, 'timestamp': '2025-10-01 04:11:04.410071', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.444007', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.027355531230568886, 'timestamp': '2025-10-01 04:11:04.469025', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.506061', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.017095940187573433, 'timestamp': '2025-10-01 04:11:04.509501', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.545675', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.021681906655430794, 'timestamp': '2025-10-01 04:11:04.549775', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.588467', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.019009903073310852, 'timestamp': '2025-10-01 04:11:04.592739', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:04.628572', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.020892620086669922, 'timestamp': '2025-10-01 04:11:04.653945', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.690114', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.00846900511533022, 'timestamp': '2025-10-01 04:11:04.694251', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.730830', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.015313724987208843, 'timestamp': '2025-10-01 04:11:04.734818', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.772292', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.017627518624067307, 'timestamp': '2025-10-01 04:11:04.776483', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.814091', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.018919754773378372, 'timestamp': '2025-10-01 04:11:04.839417', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.875244', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.00864213053137064, 'timestamp': '2025-10-01 04:11:04.879592', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:04.916555', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.021550923585891724, 'timestamp': '2025-10-01 04:11:04.920440', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:04.957114', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.04059647396206856, 'timestamp': '2025-10-01 04:11:04.960365', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:04.995341', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.03375014290213585, 'timestamp': '2025-10-01 04:11:05.020901', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:05.057601', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.015561387874186039, 'timestamp': '2025-10-01 04:11:05.061688', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.100943', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.019198428839445114, 'timestamp': '2025-10-01 04:11:05.104906', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.141443', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.020807581022381783, 'timestamp': '2025-10-01 04:11:05.146759', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:05.187110', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.021419154480099678, 'timestamp': '2025-10-01 04:11:05.214280', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:05.260220', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.025552844628691673, 'timestamp': '2025-10-01 04:11:05.266775', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.310785', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.00911114364862442, 'timestamp': '2025-10-01 04:11:05.318082', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.363217', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.03169438615441322, 'timestamp': '2025-10-01 04:11:05.369286', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.415227', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.01915550045669079, 'timestamp': '2025-10-01 04:11:05.443789', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.493723', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.029754187911748886, 'timestamp': '2025-10-01 04:11:05.498853', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.536518', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.03260469064116478, 'timestamp': '2025-10-01 04:11:05.541058', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.581991', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.017987538129091263, 'timestamp': '2025-10-01 04:11:05.586146', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.626067', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.03934092074632645, 'timestamp': '2025-10-01 04:11:05.652596', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.693603', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.028932850807905197, 'timestamp': '2025-10-01 04:11:05.697708', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.735723', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.020509885624051094, 'timestamp': '2025-10-01 04:11:05.739455', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.775868', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.023546868935227394, 'timestamp': '2025-10-01 04:11:05.779786', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.817622', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.024636918678879738, 'timestamp': '2025-10-01 04:11:05.843730', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:05.881921', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.025502773001790047, 'timestamp': '2025-10-01 04:11:05.885295', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:05.921181', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.029600782319903374, 'timestamp': '2025-10-01 04:11:05.926059', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:05.962824', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.028552714735269547, 'timestamp': '2025-10-01 04:11:05.969938', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.007101', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.021603135392069817, 'timestamp': '2025-10-01 04:11:06.033295', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.069385', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.021389275789260864, 'timestamp': '2025-10-01 04:11:06.072922', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.109361', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.023018766194581985, 'timestamp': '2025-10-01 04:11:06.113062', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.151821', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.023001650348305702, 'timestamp': '2025-10-01 04:11:06.155483', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.195371', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.026992013677954674, 'timestamp': '2025-10-01 04:11:06.221985', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.264315', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.020342260599136353, 'timestamp': '2025-10-01 04:11:06.269341', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.309157', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.022230759263038635, 'timestamp': '2025-10-01 04:11:06.314884', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:06.355713', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.02584499679505825, 'timestamp': '2025-10-01 04:11:06.359995', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:06.398943', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.028999576345086098, 'timestamp': '2025-10-01 04:11:06.426940', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:07.366202', 'step': 228, 'epoch': 1} {'type': 'pplx', 'content': 122809426.57628937, 'timestamp': '2025-10-01 04:11:07.369426', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:07.399317', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.023775307461619377, 'timestamp': '2025-10-01 04:11:07.404664', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.442257', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.026211675256490707, 'timestamp': '2025-10-01 04:11:07.446943', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.485575', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.026004331186413765, 'timestamp': '2025-10-01 04:11:07.489770', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.524818', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.027609150856733322, 'timestamp': '2025-10-01 04:11:07.549932', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:07.587581', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.02476361393928528, 'timestamp': '2025-10-01 04:11:07.592182', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.628455', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.023204132914543152, 'timestamp': '2025-10-01 04:11:07.638754', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.688423', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.022222131490707397, 'timestamp': '2025-10-01 04:11:07.692520', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.728492', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.02399165742099285, 'timestamp': '2025-10-01 04:11:07.755397', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:07.792695', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.01936420612037182, 'timestamp': '2025-10-01 04:11:07.796632', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.832715', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.027001459151506424, 'timestamp': '2025-10-01 04:11:07.835200', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.870834', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.03520625829696655, 'timestamp': '2025-10-01 04:11:07.874883', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:07.915586', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.02217874489724636, 'timestamp': '2025-10-01 04:11:07.947022', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:07.984484', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.017390580847859383, 'timestamp': '2025-10-01 04:11:07.988482', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.026755', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.011072313413023949, 'timestamp': '2025-10-01 04:11:08.030594', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.065665', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.00991674792021513, 'timestamp': '2025-10-01 04:11:08.069462', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:08.105643', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.015564543195068836, 'timestamp': '2025-10-01 04:11:08.131957', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.169739', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.044475387781858444, 'timestamp': '2025-10-01 04:11:08.180027', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.230583', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.042544569820165634, 'timestamp': '2025-10-01 04:11:08.235633', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:08.275466', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.03518744558095932, 'timestamp': '2025-10-01 04:11:08.281910', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.323556', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.03052806854248047, 'timestamp': '2025-10-01 04:11:08.351201', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.391918', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.04761296510696411, 'timestamp': '2025-10-01 04:11:08.400264', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:08.454453', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.015664685517549515, 'timestamp': '2025-10-01 04:11:08.457280', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.496811', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.03577407822012901, 'timestamp': '2025-10-01 04:11:08.502152', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.544633', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.007992579601705074, 'timestamp': '2025-10-01 04:11:08.572372', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:08.612756', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.00690767215564847, 'timestamp': '2025-10-01 04:11:08.617787', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:08.657444', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.031856562942266464, 'timestamp': '2025-10-01 04:11:08.661796', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:08.702019', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.014500172808766365, 'timestamp': '2025-10-01 04:11:08.706666', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:08.745934', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.023240460082888603, 'timestamp': '2025-10-01 04:11:08.772639', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.815253', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.010203330777585506, 'timestamp': '2025-10-01 04:11:08.819684', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.857505', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.04106441140174866, 'timestamp': '2025-10-01 04:11:08.862585', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:08.902139', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.028442902490496635, 'timestamp': '2025-10-01 04:11:08.907438', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:08.948737', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.01924443431198597, 'timestamp': '2025-10-01 04:11:08.976364', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.014149', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.02226688340306282, 'timestamp': '2025-10-01 04:11:09.019034', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.058548', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.02148137055337429, 'timestamp': '2025-10-01 04:11:09.063563', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:09.103810', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.021183112636208534, 'timestamp': '2025-10-01 04:11:09.109446', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:09.149647', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.012026933021843433, 'timestamp': '2025-10-01 04:11:09.177157', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.222469', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.022156137973070145, 'timestamp': '2025-10-01 04:11:09.229074', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.271871', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.022157371044158936, 'timestamp': '2025-10-01 04:11:09.276591', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:09.313687', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.02283759042620659, 'timestamp': '2025-10-01 04:11:09.326584', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:09.381184', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.0349494144320488, 'timestamp': '2025-10-01 04:11:09.412786', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:09.468443', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.01891479082405567, 'timestamp': '2025-10-01 04:11:09.479449', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.539317', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.022123845294117928, 'timestamp': '2025-10-01 04:11:09.548528', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.606951', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.022634102031588554, 'timestamp': '2025-10-01 04:11:09.615883', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:09.664869', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.021512528881430626, 'timestamp': '2025-10-01 04:11:09.695452', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.744714', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.022766796872019768, 'timestamp': '2025-10-01 04:11:09.754152', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:09.809403', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.02063952013850212, 'timestamp': '2025-10-01 04:11:09.817055', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:09.865832', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.02158406563103199, 'timestamp': '2025-10-01 04:11:09.873314', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:09.927481', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.02682666666805744, 'timestamp': '2025-10-01 04:11:09.958874', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:10.004073', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.017530372366309166, 'timestamp': '2025-10-01 04:11:10.013130', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:10.060282', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.026719707995653152, 'timestamp': '2025-10-01 04:11:10.067797', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:10.117019', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.024190085008740425, 'timestamp': '2025-10-01 04:11:10.125956', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:10.177319', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.03567364439368248, 'timestamp': '2025-10-01 04:11:10.209699', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:10.256584', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.01976270042359829, 'timestamp': '2025-10-01 04:11:10.289482', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:10.326949', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.018775565549731255, 'timestamp': '2025-10-01 04:11:10.337053', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:10.385431', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.022568348795175552, 'timestamp': '2025-10-01 04:11:10.394946', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:10.449806', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.029433075338602066, 'timestamp': '2025-10-01 04:11:10.481502', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:10.541012', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.02517537772655487, 'timestamp': '2025-10-01 04:11:10.551862', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:12.023011', 'step': 285, 'epoch': 1} {'type': 'pplx', 'content': 123458419.66125065, 'timestamp': '2025-10-01 04:11:12.033811', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.075459', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.01756308041512966, 'timestamp': '2025-10-01 04:11:12.086183', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:12.139662', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.021046658977866173, 'timestamp': '2025-10-01 04:11:12.150758', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:12.199184', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.02669466845691204, 'timestamp': '2025-10-01 04:11:12.231814', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.266605', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.025276560336351395, 'timestamp': '2025-10-01 04:11:12.275277', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.325581', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.017593776807188988, 'timestamp': '2025-10-01 04:11:12.336113', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.385996', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.02213350310921669, 'timestamp': '2025-10-01 04:11:12.389863', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:12.441883', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.024864699691534042, 'timestamp': '2025-10-01 04:11:12.469387', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.527746', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.018032578751444817, 'timestamp': '2025-10-01 04:11:12.531862', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.592939', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.025712715461850166, 'timestamp': '2025-10-01 04:11:12.598654', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.644955', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.021214189007878304, 'timestamp': '2025-10-01 04:11:12.649453', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:12.713329', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.017967000603675842, 'timestamp': '2025-10-01 04:11:12.739521', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.801217', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.020910972729325294, 'timestamp': '2025-10-01 04:11:12.808802', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:12.866167', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.025297032669186592, 'timestamp': '2025-10-01 04:11:12.869763', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:12.925348', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.01858631707727909, 'timestamp': '2025-10-01 04:11:12.936219', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:12.992095', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.018205394968390465, 'timestamp': '2025-10-01 04:11:13.024923', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.068494', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.02749958448112011, 'timestamp': '2025-10-01 04:11:13.071592', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:13.122732', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.012391953729093075, 'timestamp': '2025-10-01 04:11:13.132353', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.185938', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.01051428634673357, 'timestamp': '2025-10-01 04:11:13.195347', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.242587', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.017451424151659012, 'timestamp': '2025-10-01 04:11:13.267884', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.314376', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.019919848069548607, 'timestamp': '2025-10-01 04:11:13.317479', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.366986', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.03193918988108635, 'timestamp': '2025-10-01 04:11:13.371398', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.424406', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.030981769785284996, 'timestamp': '2025-10-01 04:11:13.434069', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.477614', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.020484397187829018, 'timestamp': '2025-10-01 04:11:13.508874', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.563311', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.006139458157122135, 'timestamp': '2025-10-01 04:11:13.567099', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.605957', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.04061734676361084, 'timestamp': '2025-10-01 04:11:13.615570', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.661155', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.03407382592558861, 'timestamp': '2025-10-01 04:11:13.665953', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.713417', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.018781576305627823, 'timestamp': '2025-10-01 04:11:13.743162', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.789236', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.02269078977406025, 'timestamp': '2025-10-01 04:11:13.797855', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.832261', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.017462121322751045, 'timestamp': '2025-10-01 04:11:13.836397', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.879321', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.0337679348886013, 'timestamp': '2025-10-01 04:11:13.882439', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:13.928027', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.005852936767041683, 'timestamp': '2025-10-01 04:11:13.953685', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.012234', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.015508134849369526, 'timestamp': '2025-10-01 04:11:14.016350', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:14.063073', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.030224744230508804, 'timestamp': '2025-10-01 04:11:14.073973', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.128368', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.01716790907084942, 'timestamp': '2025-10-01 04:11:14.137149', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:14.181796', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.020314916968345642, 'timestamp': '2025-10-01 04:11:14.207770', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:14.243735', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.02022932842373848, 'timestamp': '2025-10-01 04:11:14.252250', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.287561', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.03518899530172348, 'timestamp': '2025-10-01 04:11:14.292085', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.329769', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.03895551338791847, 'timestamp': '2025-10-01 04:11:14.333485', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:14.381017', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.030202005058526993, 'timestamp': '2025-10-01 04:11:14.413907', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:14.451810', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.02211807295680046, 'timestamp': '2025-10-01 04:11:14.459989', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.508668', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.025438370183110237, 'timestamp': '2025-10-01 04:11:14.518422', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.568591', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.028444336727261543, 'timestamp': '2025-10-01 04:11:14.571132', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:14.621409', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.026081737130880356, 'timestamp': '2025-10-01 04:11:14.651185', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:14.689696', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.026819607242941856, 'timestamp': '2025-10-01 04:11:14.701449', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.758718', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.024707946926355362, 'timestamp': '2025-10-01 04:11:14.773714', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:14.837086', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.021115347743034363, 'timestamp': '2025-10-01 04:11:14.850224', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:14.913208', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.022157754749059677, 'timestamp': '2025-10-01 04:11:14.947292', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:15.016196', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.02936716005206108, 'timestamp': '2025-10-01 04:11:15.028300', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:15.082357', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.0242962297052145, 'timestamp': '2025-10-01 04:11:15.093062', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:15.152137', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.024363428354263306, 'timestamp': '2025-10-01 04:11:15.161507', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:15.215923', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.030734961852431297, 'timestamp': '2025-10-01 04:11:15.247500', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:15.290273', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.03119891881942749, 'timestamp': '2025-10-01 04:11:15.298587', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:15.352209', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.027752574533224106, 'timestamp': '2025-10-01 04:11:15.361906', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:15.419423', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.023446708917617798, 'timestamp': '2025-10-01 04:11:15.423173', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:15.478963', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.02432301454246044, 'timestamp': '2025-10-01 04:11:15.510136', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:15.563526', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.0277759600430727, 'timestamp': '2025-10-01 04:11:15.571468', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:15.626770', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.026494231075048447, 'timestamp': '2025-10-01 04:11:15.636723', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:16.887801', 'step': 342, 'epoch': 1} {'type': 'pplx', 'content': 119641692.44329666, 'timestamp': '2025-10-01 04:11:16.892772', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:16.929417', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.025429880246520042, 'timestamp': '2025-10-01 04:11:16.934622', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:16.976165', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.02810918726027012, 'timestamp': '2025-10-01 04:11:17.000203', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.036728', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.02276870794594288, 'timestamp': '2025-10-01 04:11:17.042121', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:17.088648', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.02521303854882717, 'timestamp': '2025-10-01 04:11:17.095138', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:17.150265', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.0223286934196949, 'timestamp': '2025-10-01 04:11:17.161778', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.224552', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.02348393388092518, 'timestamp': '2025-10-01 04:11:17.259066', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:17.319123', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.019929800182580948, 'timestamp': '2025-10-01 04:11:17.328584', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.376797', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.024558162316679955, 'timestamp': '2025-10-01 04:11:17.379457', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.415822', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.027435820549726486, 'timestamp': '2025-10-01 04:11:17.419765', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.458325', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.025386372581124306, 'timestamp': '2025-10-01 04:11:17.486734', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.524048', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.028569668531417847, 'timestamp': '2025-10-01 04:11:17.533496', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:17.589555', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.023457257077097893, 'timestamp': '2025-10-01 04:11:17.598781', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.645443', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.025979798287153244, 'timestamp': '2025-10-01 04:11:17.648451', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.688180', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.01912418194115162, 'timestamp': '2025-10-01 04:11:17.718475', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:17.764883', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.020123664289712906, 'timestamp': '2025-10-01 04:11:17.768559', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.821188', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.017067687585949898, 'timestamp': '2025-10-01 04:11:17.824440', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:17.865217', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.015169544145464897, 'timestamp': '2025-10-01 04:11:17.876455', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:17.922614', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.021369751542806625, 'timestamp': '2025-10-01 04:11:17.951034', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:17.990032', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.025677619501948357, 'timestamp': '2025-10-01 04:11:17.992682', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.029442', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.03131221607327461, 'timestamp': '2025-10-01 04:11:18.036011', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:18.070878', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.02442021481692791, 'timestamp': '2025-10-01 04:11:18.077546', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.115727', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.013035761192440987, 'timestamp': '2025-10-01 04:11:18.140492', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.184440', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.02659951150417328, 'timestamp': '2025-10-01 04:11:18.187011', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:18.223164', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.016170557588338852, 'timestamp': '2025-10-01 04:11:18.229705', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:18.274709', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.018221816048026085, 'timestamp': '2025-10-01 04:11:18.282506', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.327114', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.02146676555275917, 'timestamp': '2025-10-01 04:11:18.354184', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.389949', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.025201361626386642, 'timestamp': '2025-10-01 04:11:18.395673', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:18.437111', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.026593759655952454, 'timestamp': '2025-10-01 04:11:18.443651', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.486574', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.01598348841071129, 'timestamp': '2025-10-01 04:11:18.493550', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.535039', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.030138777568936348, 'timestamp': '2025-10-01 04:11:18.562546', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:18.603815', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.028218790888786316, 'timestamp': '2025-10-01 04:11:18.612077', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.652703', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.03160586208105087, 'timestamp': '2025-10-01 04:11:18.658428', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.697420', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.018323153257369995, 'timestamp': '2025-10-01 04:11:18.703450', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.742862', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.020284952595829964, 'timestamp': '2025-10-01 04:11:18.768866', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.804798', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.012562897987663746, 'timestamp': '2025-10-01 04:11:18.809945', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.855185', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.019718678668141365, 'timestamp': '2025-10-01 04:11:18.861059', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:18.900339', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.01936933770775795, 'timestamp': '2025-10-01 04:11:18.905467', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:18.951842', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.022852379828691483, 'timestamp': '2025-10-01 04:11:18.978112', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.023384', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.026843106374144554, 'timestamp': '2025-10-01 04:11:19.031828', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:19.076923', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.019951580092310905, 'timestamp': '2025-10-01 04:11:19.086965', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.132695', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.012562128715217113, 'timestamp': '2025-10-01 04:11:19.140474', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:19.181383', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.020456364378333092, 'timestamp': '2025-10-01 04:11:19.210285', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:19.248051', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.015333155170083046, 'timestamp': '2025-10-01 04:11:19.253166', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.295231', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.018947962671518326, 'timestamp': '2025-10-01 04:11:19.301013', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.337554', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.018343036994338036, 'timestamp': '2025-10-01 04:11:19.342893', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.383521', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.01990184746682644, 'timestamp': '2025-10-01 04:11:19.411725', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:19.457528', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.01581694185733795, 'timestamp': '2025-10-01 04:11:19.464236', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.508554', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.02895066700875759, 'timestamp': '2025-10-01 04:11:19.520622', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.563260', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.034577902406454086, 'timestamp': '2025-10-01 04:11:19.570058', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.620479', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.01622415892779827, 'timestamp': '2025-10-01 04:11:19.648093', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.688188', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.017028475180268288, 'timestamp': '2025-10-01 04:11:19.692937', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:19.732699', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.021526996046304703, 'timestamp': '2025-10-01 04:11:19.738216', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.781318', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.011884956620633602, 'timestamp': '2025-10-01 04:11:19.785612', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.825853', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.023782964795827866, 'timestamp': '2025-10-01 04:11:19.855267', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:19.904400', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.011757438071072102, 'timestamp': '2025-10-01 04:11:19.915780', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:19.963058', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.01555390004068613, 'timestamp': '2025-10-01 04:11:19.969003', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:20.017948', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.023047687485814095, 'timestamp': '2025-10-01 04:11:20.020359', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:21.209850', 'step': 399, 'epoch': 1} {'type': 'pplx', 'content': 122024603.38308345, 'timestamp': '2025-10-01 04:11:21.217118', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:21.254869', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.01790771260857582, 'timestamp': '2025-10-01 04:11:21.284616', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:21.336133', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.011960442177951336, 'timestamp': '2025-10-01 04:11:21.344495', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:21.395634', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.01990598626434803, 'timestamp': '2025-10-01 04:11:21.406115', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:21.452092', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.020694376900792122, 'timestamp': '2025-10-01 04:11:21.458386', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:21.512984', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.029738137498497963, 'timestamp': '2025-10-01 04:11:21.544812', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:21.594232', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.027073880657553673, 'timestamp': '2025-10-01 04:11:21.602336', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:21.653692', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.03174943849444389, 'timestamp': '2025-10-01 04:11:21.663507', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:21.714786', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.014953884296119213, 'timestamp': '2025-10-01 04:11:21.722188', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:21.771943', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.012122013606131077, 'timestamp': '2025-10-01 04:11:21.805997', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:21.844584', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.020316915586590767, 'timestamp': '2025-10-01 04:11:21.854618', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:21.907373', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.021981576457619667, 'timestamp': '2025-10-01 04:11:21.918327', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:21.966125', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.018987832590937614, 'timestamp': '2025-10-01 04:11:21.973583', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:22.021265', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.031161749735474586, 'timestamp': '2025-10-01 04:11:22.046053', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.096071', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.015418009832501411, 'timestamp': '2025-10-01 04:11:22.103458', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.155537', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.030573803931474686, 'timestamp': '2025-10-01 04:11:22.163064', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.207582', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.03835904970765114, 'timestamp': '2025-10-01 04:11:22.214062', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.261659', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.01001194678246975, 'timestamp': '2025-10-01 04:11:22.292483', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.342012', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.025580940768122673, 'timestamp': '2025-10-01 04:11:22.351194', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.397964', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.02997286431491375, 'timestamp': '2025-10-01 04:11:22.408030', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.463339', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.021901780739426613, 'timestamp': '2025-10-01 04:11:22.469058', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:22.513040', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.016856132075190544, 'timestamp': '2025-10-01 04:11:22.541490', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.588021', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.011998930014669895, 'timestamp': '2025-10-01 04:11:22.599767', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:22.664805', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.011346760205924511, 'timestamp': '2025-10-01 04:11:22.673954', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:22.723750', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.01255186740309, 'timestamp': '2025-10-01 04:11:22.726544', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:22.773621', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.021336868405342102, 'timestamp': '2025-10-01 04:11:22.804610', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:22.859929', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.0202977005392313, 'timestamp': '2025-10-01 04:11:22.871066', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:22.924978', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.030498603358864784, 'timestamp': '2025-10-01 04:11:22.928692', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:22.976022', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.02014249376952648, 'timestamp': '2025-10-01 04:11:22.987075', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.040232', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.02097696252167225, 'timestamp': '2025-10-01 04:11:23.072232', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.123792', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.038673240691423416, 'timestamp': '2025-10-01 04:11:23.127105', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:23.174343', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.01677296869456768, 'timestamp': '2025-10-01 04:11:23.182653', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.231806', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.012115752324461937, 'timestamp': '2025-10-01 04:11:23.241274', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:23.289502', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.019276294857263565, 'timestamp': '2025-10-01 04:11:23.318581', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.358622', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.024136707186698914, 'timestamp': '2025-10-01 04:11:23.368852', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:23.421813', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.02334047667682171, 'timestamp': '2025-10-01 04:11:23.427072', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.472575', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.020124908536672592, 'timestamp': '2025-10-01 04:11:23.482096', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.535856', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.018802836537361145, 'timestamp': '2025-10-01 04:11:23.568582', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.618627', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.023524196818470955, 'timestamp': '2025-10-01 04:11:23.622349', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.657160', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.023192165419459343, 'timestamp': '2025-10-01 04:11:23.665523', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.707024', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.03054480254650116, 'timestamp': '2025-10-01 04:11:23.713306', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.762565', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.021388614550232887, 'timestamp': '2025-10-01 04:11:23.790395', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.835924', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.029041411355137825, 'timestamp': '2025-10-01 04:11:23.843332', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:23.889330', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.027110761031508446, 'timestamp': '2025-10-01 04:11:23.896453', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:23.940065', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.0177504513412714, 'timestamp': '2025-10-01 04:11:23.947262', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:23.996658', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.02896556630730629, 'timestamp': '2025-10-01 04:11:24.025450', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.069273', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.02039783075451851, 'timestamp': '2025-10-01 04:11:24.075484', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.120500', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.021455178037285805, 'timestamp': '2025-10-01 04:11:24.123772', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:24.160550', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.01578565128147602, 'timestamp': '2025-10-01 04:11:24.167380', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.213798', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.019582688808441162, 'timestamp': '2025-10-01 04:11:24.246017', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:24.290111', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.017545174807310104, 'timestamp': '2025-10-01 04:11:24.297889', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.338911', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.02778690680861473, 'timestamp': '2025-10-01 04:11:24.345652', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.392748', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.020310161635279655, 'timestamp': '2025-10-01 04:11:24.399949', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.443342', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.01537756435573101, 'timestamp': '2025-10-01 04:11:24.474569', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.523173', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.02046174742281437, 'timestamp': '2025-10-01 04:11:24.528769', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:24.576663', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.02453591488301754, 'timestamp': '2025-10-01 04:11:24.584379', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:24.628219', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.021878505125641823, 'timestamp': '2025-10-01 04:11:24.631028', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:24.678347', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.018454913049936295, 'timestamp': '2025-10-01 04:11:24.711161', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:26.221328', 'step': 456, 'epoch': 1} {'type': 'pplx', 'content': 130055133.76006038, 'timestamp': '2025-10-01 04:11:26.230991', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.272793', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.022608721628785133, 'timestamp': '2025-10-01 04:11:26.282629', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:26.335087', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.0280974842607975, 'timestamp': '2025-10-01 04:11:26.345856', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.407223', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.017616761848330498, 'timestamp': '2025-10-01 04:11:26.420275', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:26.479397', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.0306308064609766, 'timestamp': '2025-10-01 04:11:26.511368', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.562366', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.02061159536242485, 'timestamp': '2025-10-01 04:11:26.573676', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.628336', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.015530804172158241, 'timestamp': '2025-10-01 04:11:26.631710', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.679849', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.011279831640422344, 'timestamp': '2025-10-01 04:11:26.690186', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.743968', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.019206982105970383, 'timestamp': '2025-10-01 04:11:26.768849', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.812608', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.021167373284697533, 'timestamp': '2025-10-01 04:11:26.817101', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.876645', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.017267005518078804, 'timestamp': '2025-10-01 04:11:26.891235', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:26.951176', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.024158697575330734, 'timestamp': '2025-10-01 04:11:26.962773', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.020341', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.02584967017173767, 'timestamp': '2025-10-01 04:11:27.053211', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.107640', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.02439996600151062, 'timestamp': '2025-10-01 04:11:27.118767', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.176595', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.029676523059606552, 'timestamp': '2025-10-01 04:11:27.191014', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.261187', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.011198568157851696, 'timestamp': '2025-10-01 04:11:27.266160', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.317368', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.018555212765932083, 'timestamp': '2025-10-01 04:11:27.353277', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:27.412642', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.01946217007935047, 'timestamp': '2025-10-01 04:11:27.423749', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:27.482034', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.02705521322786808, 'timestamp': '2025-10-01 04:11:27.493430', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:27.545635', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.015881655737757683, 'timestamp': '2025-10-01 04:11:27.551250', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.602855', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.015143660828471184, 'timestamp': '2025-10-01 04:11:27.636273', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.689037', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.016437551006674767, 'timestamp': '2025-10-01 04:11:27.702284', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.766160', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.024245990440249443, 'timestamp': '2025-10-01 04:11:27.778044', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.841867', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.015856830403208733, 'timestamp': '2025-10-01 04:11:27.854357', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:27.918230', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.006872235331684351, 'timestamp': '2025-10-01 04:11:27.952875', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.010340', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.01868906430900097, 'timestamp': '2025-10-01 04:11:28.023917', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.087688', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.0282533448189497, 'timestamp': '2025-10-01 04:11:28.097800', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:28.149950', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.017546121031045914, 'timestamp': '2025-10-01 04:11:28.159246', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.204649', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.010689259506762028, 'timestamp': '2025-10-01 04:11:28.236264', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:28.286274', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.017749961465597153, 'timestamp': '2025-10-01 04:11:28.296031', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.349267', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.019139355048537254, 'timestamp': '2025-10-01 04:11:28.357242', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:28.409349', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.035080693662166595, 'timestamp': '2025-10-01 04:11:28.416474', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.467784', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.02607973851263523, 'timestamp': '2025-10-01 04:11:28.498015', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.553625', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.01762440800666809, 'timestamp': '2025-10-01 04:11:28.565257', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:28.618742', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.015824036672711372, 'timestamp': '2025-10-01 04:11:28.629979', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.676386', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.0184563510119915, 'timestamp': '2025-10-01 04:11:28.687589', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:28.739797', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.014332951046526432, 'timestamp': '2025-10-01 04:11:28.771525', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.823705', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.013719163835048676, 'timestamp': '2025-10-01 04:11:28.831465', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.889022', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.03325566649436951, 'timestamp': '2025-10-01 04:11:28.892931', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.927265', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.021148711442947388, 'timestamp': '2025-10-01 04:11:28.931412', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:28.974937', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.02531592920422554, 'timestamp': '2025-10-01 04:11:29.005500', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:29.054608', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.01008320041000843, 'timestamp': '2025-10-01 04:11:29.063165', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:29.109695', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.021790364757180214, 'timestamp': '2025-10-01 04:11:29.118405', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:29.174090', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.02201670967042446, 'timestamp': '2025-10-01 04:11:29.182176', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:29.229338', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.013443304225802422, 'timestamp': '2025-10-01 04:11:29.264043', 'step': 500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-10-01 04:11:34.173273', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.231112', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.012881094589829445, 'timestamp': '2025-10-01 04:11:34.244068', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:34.306018', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.026178503409028053, 'timestamp': '2025-10-01 04:11:34.311623', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.363636', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.02012251317501068, 'timestamp': '2025-10-01 04:11:34.372496', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.424270', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.022417036816477776, 'timestamp': '2025-10-01 04:11:34.455470', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.500142', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.0069273607805371284, 'timestamp': '2025-10-01 04:11:34.509824', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.562129', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.03598469868302345, 'timestamp': '2025-10-01 04:11:34.570327', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:34.619445', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.02659173123538494, 'timestamp': '2025-10-01 04:11:34.623475', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:34.664081', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.02742273546755314, 'timestamp': '2025-10-01 04:11:34.695844', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.735857', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.014712974429130554, 'timestamp': '2025-10-01 04:11:34.739445', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.782424', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.012078308500349522, 'timestamp': '2025-10-01 04:11:34.786894', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.827497', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.028882889077067375, 'timestamp': '2025-10-01 04:11:34.832141', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:34.884158', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.010723995044827461, 'timestamp': '2025-10-01 04:11:34.916416', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:34.968133', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.03016284666955471, 'timestamp': '2025-10-01 04:11:34.976869', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:36.604281', 'step': 513, 'epoch': 1} {'type': 'pplx', 'content': 144971613.22876847, 'timestamp': '2025-10-01 04:11:36.616119', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:36.663179', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.009461253881454468, 'timestamp': '2025-10-01 04:11:36.674355', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:36.725412', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.0260606799274683, 'timestamp': '2025-10-01 04:11:36.737766', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:36.792608', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.011063075624406338, 'timestamp': '2025-10-01 04:11:36.820150', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:36.866006', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.02779548428952694, 'timestamp': '2025-10-01 04:11:36.875604', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:36.933550', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.02797873318195343, 'timestamp': '2025-10-01 04:11:36.942350', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:36.986118', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.030544089153409004, 'timestamp': '2025-10-01 04:11:36.990498', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:37.032133', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.01135441567748785, 'timestamp': '2025-10-01 04:11:37.062897', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:37.109278', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.02531917579472065, 'timestamp': '2025-10-01 04:11:37.113661', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.159294', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.030459651723504066, 'timestamp': '2025-10-01 04:11:37.163379', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.202923', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.012750692665576935, 'timestamp': '2025-10-01 04:11:37.206857', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:37.258203', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.005090842954814434, 'timestamp': '2025-10-01 04:11:37.290681', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.338819', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.013615354895591736, 'timestamp': '2025-10-01 04:11:37.352708', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.404254', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.012541785836219788, 'timestamp': '2025-10-01 04:11:37.415637', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:37.465835', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.013080544769763947, 'timestamp': '2025-10-01 04:11:37.477060', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.526447', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.015069060027599335, 'timestamp': '2025-10-01 04:11:37.558216', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.611158', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.016289498656988144, 'timestamp': '2025-10-01 04:11:37.615975', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.668954', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.013955731876194477, 'timestamp': '2025-10-01 04:11:37.677874', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.729797', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.025528011843562126, 'timestamp': '2025-10-01 04:11:37.742663', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.785845', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.01630363240838051, 'timestamp': '2025-10-01 04:11:37.817789', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:37.862583', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.025512894615530968, 'timestamp': '2025-10-01 04:11:37.865298', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:37.918389', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.030232040211558342, 'timestamp': '2025-10-01 04:11:37.927858', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:37.968421', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.026811758056282997, 'timestamp': '2025-10-01 04:11:37.977836', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.028632', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.011201995424926281, 'timestamp': '2025-10-01 04:11:38.053083', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:38.092867', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.03314945474267006, 'timestamp': '2025-10-01 04:11:38.099522', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.142433', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.014284268952906132, 'timestamp': '2025-10-01 04:11:38.145838', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:38.186400', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.04397100955247879, 'timestamp': '2025-10-01 04:11:38.193458', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:38.237954', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.021980592980980873, 'timestamp': '2025-10-01 04:11:38.265054', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.311808', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.03475477546453476, 'timestamp': '2025-10-01 04:11:38.320167', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.359102', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.019862128421664238, 'timestamp': '2025-10-01 04:11:38.368213', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.406858', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.01110259909182787, 'timestamp': '2025-10-01 04:11:38.409661', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:38.448788', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.015918336808681488, 'timestamp': '2025-10-01 04:11:38.474360', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:38.512986', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.02247166447341442, 'timestamp': '2025-10-01 04:11:38.516924', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.553224', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.02808975800871849, 'timestamp': '2025-10-01 04:11:38.557013', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.592106', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.03257390111684799, 'timestamp': '2025-10-01 04:11:38.595029', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:38.631955', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.020895110443234444, 'timestamp': '2025-10-01 04:11:38.657815', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.692790', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.03609849140048027, 'timestamp': '2025-10-01 04:11:38.698190', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.734437', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.03249197080731392, 'timestamp': '2025-10-01 04:11:38.738953', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.775635', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.02835281565785408, 'timestamp': '2025-10-01 04:11:38.782143', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:38.827829', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.03946463018655777, 'timestamp': '2025-10-01 04:11:38.856539', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:38.915006', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.020388005301356316, 'timestamp': '2025-10-01 04:11:38.919534', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:38.965187', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.02258807048201561, 'timestamp': '2025-10-01 04:11:38.975571', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.035104', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.01330896932631731, 'timestamp': '2025-10-01 04:11:39.041654', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:39.097170', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.026032237336039543, 'timestamp': '2025-10-01 04:11:39.122580', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:39.173247', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.023419994860887527, 'timestamp': '2025-10-01 04:11:39.184804', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:39.232931', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.016865305602550507, 'timestamp': '2025-10-01 04:11:39.240914', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:39.283719', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.018437618389725685, 'timestamp': '2025-10-01 04:11:39.294422', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:39.347342', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.01190783642232418, 'timestamp': '2025-10-01 04:11:39.375401', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.424333', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.01879892125725746, 'timestamp': '2025-10-01 04:11:39.427746', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.490886', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.014638060703873634, 'timestamp': '2025-10-01 04:11:39.503826', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.559223', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.015171224251389503, 'timestamp': '2025-10-01 04:11:39.571749', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.615752', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.030538225546479225, 'timestamp': '2025-10-01 04:11:39.644445', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:39.697071', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.02481764741241932, 'timestamp': '2025-10-01 04:11:39.699689', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.741554', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.021298576146364212, 'timestamp': '2025-10-01 04:11:39.746909', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.788689', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.015441848896443844, 'timestamp': '2025-10-01 04:11:39.794697', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:39.833843', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.023940859362483025, 'timestamp': '2025-10-01 04:11:39.861606', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.902005', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.030663734301924706, 'timestamp': '2025-10-01 04:11:39.908904', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:39.952564', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.01633959636092186, 'timestamp': '2025-10-01 04:11:39.958638', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:41.224592', 'step': 570, 'epoch': 1} {'type': 'pplx', 'content': 146728730.95887718, 'timestamp': '2025-10-01 04:11:41.228038', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.259141', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.014267205260694027, 'timestamp': '2025-10-01 04:11:41.263819', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.300716', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.017848124727606773, 'timestamp': '2025-10-01 04:11:41.331862', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:41.385110', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.029249539598822594, 'timestamp': '2025-10-01 04:11:41.392861', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.446810', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.019859248772263527, 'timestamp': '2025-10-01 04:11:41.450320', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.498695', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.014449072070419788, 'timestamp': '2025-10-01 04:11:41.506288', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.556632', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.00982545968145132, 'timestamp': '2025-10-01 04:11:41.591449', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:41.636913', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.018064500764012337, 'timestamp': '2025-10-01 04:11:41.648203', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:41.702802', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.027665166184306145, 'timestamp': '2025-10-01 04:11:41.712337', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.757805', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.013823381625115871, 'timestamp': '2025-10-01 04:11:41.765391', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.811314', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.01081685908138752, 'timestamp': '2025-10-01 04:11:41.842087', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:41.888817', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.012843619100749493, 'timestamp': '2025-10-01 04:11:41.892572', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:41.926267', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.004551833029836416, 'timestamp': '2025-10-01 04:11:41.930554', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:41.978378', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.022612929344177246, 'timestamp': '2025-10-01 04:11:41.989583', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:42.034695', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.018055619671940804, 'timestamp': '2025-10-01 04:11:42.067852', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.123287', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.0163103099912405, 'timestamp': '2025-10-01 04:11:42.128278', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.173776', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.003515422809869051, 'timestamp': '2025-10-01 04:11:42.177904', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.231181', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.002217690460383892, 'timestamp': '2025-10-01 04:11:42.235150', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:42.279365', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.048120323568582535, 'timestamp': '2025-10-01 04:11:42.303808', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.344496', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.03937225788831711, 'timestamp': '2025-10-01 04:11:42.350980', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.394800', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.026554737240076065, 'timestamp': '2025-10-01 04:11:42.401055', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:11:42.456468', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.03893747180700302, 'timestamp': '2025-10-01 04:11:42.461524', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.498399', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.016101589426398277, 'timestamp': '2025-10-01 04:11:42.522332', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.561069', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.024273639544844627, 'timestamp': '2025-10-01 04:11:42.566142', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:42.603836', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.005769102834165096, 'timestamp': '2025-10-01 04:11:42.606126', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.643810', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.017685530707240105, 'timestamp': '2025-10-01 04:11:42.648496', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.689213', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.03208998963236809, 'timestamp': '2025-10-01 04:11:42.715515', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:42.759915', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.018433064222335815, 'timestamp': '2025-10-01 04:11:42.767772', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.804217', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.019542574882507324, 'timestamp': '2025-10-01 04:11:42.810714', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:42.845310', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.019781693816184998, 'timestamp': '2025-10-01 04:11:42.849144', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.887017', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.030181298032402992, 'timestamp': '2025-10-01 04:11:42.918198', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:42.956952', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.04341671243309975, 'timestamp': '2025-10-01 04:11:42.966349', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.008213', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.019647929817438126, 'timestamp': '2025-10-01 04:11:43.011942', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.061044', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.03288035839796066, 'timestamp': '2025-10-01 04:11:43.068800', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.114934', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.012869777157902718, 'timestamp': '2025-10-01 04:11:43.140205', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.190453', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.014938059262931347, 'timestamp': '2025-10-01 04:11:43.197875', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:43.243030', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.020758474245667458, 'timestamp': '2025-10-01 04:11:43.254294', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.313300', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.023509101942181587, 'timestamp': '2025-10-01 04:11:43.325382', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:43.379107', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.012371575459837914, 'timestamp': '2025-10-01 04:11:43.404940', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.457173', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.013531157746911049, 'timestamp': '2025-10-01 04:11:43.460037', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:43.509484', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.009105498902499676, 'timestamp': '2025-10-01 04:11:43.513600', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.577249', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.029772654175758362, 'timestamp': '2025-10-01 04:11:43.580799', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.625139', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.0325927771627903, 'timestamp': '2025-10-01 04:11:43.655460', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.712267', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.01552466582506895, 'timestamp': '2025-10-01 04:11:43.715922', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.768814', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.019661977887153625, 'timestamp': '2025-10-01 04:11:43.778483', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.824321', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.040737394243478775, 'timestamp': '2025-10-01 04:11:43.834053', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.881033', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.010232476517558098, 'timestamp': '2025-10-01 04:11:43.909598', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:43.951807', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.030402900651097298, 'timestamp': '2025-10-01 04:11:43.956166', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:43.994275', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.013743719086050987, 'timestamp': '2025-10-01 04:11:43.999529', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:44.046112', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.025667309761047363, 'timestamp': '2025-10-01 04:11:44.050403', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:44.087058', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.008081094361841679, 'timestamp': '2025-10-01 04:11:44.113099', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:44.150732', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.0131992744281888, 'timestamp': '2025-10-01 04:11:44.155427', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:44.193956', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.0235703457146883, 'timestamp': '2025-10-01 04:11:44.198410', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:44.234797', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.0194076057523489, 'timestamp': '2025-10-01 04:11:44.242420', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:44.293021', 'step': 623, 'epoch': 1} {'type': 'loss', 'content': 0.053576499223709106, 'timestamp': '2025-10-01 04:11:44.322495', 'step': 624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:44.366256', 'step': 624, 'epoch': 1} {'type': 'loss', 'content': 0.0185772143304348, 'timestamp': '2025-10-01 04:11:44.369297', 'step': 625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:44.409271', 'step': 625, 'epoch': 1} {'type': 'loss', 'content': 0.01065006759017706, 'timestamp': '2025-10-01 04:11:44.415553', 'step': 626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:44.458647', 'step': 626, 'epoch': 1} {'type': 'loss', 'content': 0.00875663198530674, 'timestamp': '2025-10-01 04:11:44.461503', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:45.699108', 'step': 627, 'epoch': 1} {'type': 'pplx', 'content': 127842067.85324596, 'timestamp': '2025-10-01 04:11:45.704120', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:45.748908', 'step': 627, 'epoch': 1} {'type': 'loss', 'content': 0.00792365800589323, 'timestamp': '2025-10-01 04:11:45.774658', 'step': 628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:45.826805', 'step': 628, 'epoch': 1} {'type': 'loss', 'content': 0.039805445820093155, 'timestamp': '2025-10-01 04:11:45.837193', 'step': 629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:45.884717', 'step': 629, 'epoch': 1} {'type': 'loss', 'content': 0.0297335684299469, 'timestamp': '2025-10-01 04:11:45.888021', 'step': 630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:45.926800', 'step': 630, 'epoch': 1} {'type': 'loss', 'content': 0.017810344696044922, 'timestamp': '2025-10-01 04:11:45.929150', 'step': 631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:45.962978', 'step': 631, 'epoch': 1} {'type': 'loss', 'content': 0.018673760816454887, 'timestamp': '2025-10-01 04:11:45.987846', 'step': 632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.038102', 'step': 632, 'epoch': 1} {'type': 'loss', 'content': 0.009931197389960289, 'timestamp': '2025-10-01 04:11:46.040828', 'step': 633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.084540', 'step': 633, 'epoch': 1} {'type': 'loss', 'content': 0.012832448817789555, 'timestamp': '2025-10-01 04:11:46.093694', 'step': 634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:46.142397', 'step': 634, 'epoch': 1} {'type': 'loss', 'content': 0.02591419219970703, 'timestamp': '2025-10-01 04:11:46.151771', 'step': 635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.194918', 'step': 635, 'epoch': 1} {'type': 'loss', 'content': 0.013405759818851948, 'timestamp': '2025-10-01 04:11:46.222594', 'step': 636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:46.272078', 'step': 636, 'epoch': 1} {'type': 'loss', 'content': 0.008755778893828392, 'timestamp': '2025-10-01 04:11:46.283053', 'step': 637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.343153', 'step': 637, 'epoch': 1} {'type': 'loss', 'content': 0.022010071203112602, 'timestamp': '2025-10-01 04:11:46.348247', 'step': 638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.387259', 'step': 638, 'epoch': 1} {'type': 'loss', 'content': 0.028203662484884262, 'timestamp': '2025-10-01 04:11:46.396811', 'step': 639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.452951', 'step': 639, 'epoch': 1} {'type': 'loss', 'content': 0.00725026847794652, 'timestamp': '2025-10-01 04:11:46.485321', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.528735', 'step': 640, 'epoch': 1} {'type': 'loss', 'content': 0.011902433820068836, 'timestamp': '2025-10-01 04:11:46.533869', 'step': 641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:46.582609', 'step': 641, 'epoch': 1} {'type': 'loss', 'content': 0.01911485567688942, 'timestamp': '2025-10-01 04:11:46.593115', 'step': 642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.652808', 'step': 642, 'epoch': 1} {'type': 'loss', 'content': 0.014642453752458096, 'timestamp': '2025-10-01 04:11:46.663324', 'step': 643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:46.716292', 'step': 643, 'epoch': 1} {'type': 'loss', 'content': 0.030542129650712013, 'timestamp': '2025-10-01 04:11:46.747460', 'step': 644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.791680', 'step': 644, 'epoch': 1} {'type': 'loss', 'content': 0.013660822995007038, 'timestamp': '2025-10-01 04:11:46.795017', 'step': 645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:46.837949', 'step': 645, 'epoch': 1} {'type': 'loss', 'content': 0.028145968914031982, 'timestamp': '2025-10-01 04:11:46.850772', 'step': 646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:46.914406', 'step': 646, 'epoch': 1} {'type': 'loss', 'content': 0.014101878739893436, 'timestamp': '2025-10-01 04:11:46.918770', 'step': 647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:46.976185', 'step': 647, 'epoch': 1} {'type': 'loss', 'content': 0.012868748977780342, 'timestamp': '2025-10-01 04:11:47.008560', 'step': 648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:47.062657', 'step': 648, 'epoch': 1} {'type': 'loss', 'content': 0.013591781258583069, 'timestamp': '2025-10-01 04:11:47.075665', 'step': 649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:47.135458', 'step': 649, 'epoch': 1} {'type': 'loss', 'content': 0.010220631957054138, 'timestamp': '2025-10-01 04:11:47.140479', 'step': 650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.205954', 'step': 650, 'epoch': 1} {'type': 'loss', 'content': 0.017303917557001114, 'timestamp': '2025-10-01 04:11:47.219650', 'step': 651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.271695', 'step': 651, 'epoch': 1} {'type': 'loss', 'content': 0.014156176708638668, 'timestamp': '2025-10-01 04:11:47.303711', 'step': 652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.351934', 'step': 652, 'epoch': 1} {'type': 'loss', 'content': 0.019841428846120834, 'timestamp': '2025-10-01 04:11:47.359327', 'step': 653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.403875', 'step': 653, 'epoch': 1} {'type': 'loss', 'content': 0.02891084924340248, 'timestamp': '2025-10-01 04:11:47.411774', 'step': 654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:47.460795', 'step': 654, 'epoch': 1} {'type': 'loss', 'content': 0.01000217255204916, 'timestamp': '2025-10-01 04:11:47.469112', 'step': 655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.522327', 'step': 655, 'epoch': 1} {'type': 'loss', 'content': 0.007814330980181694, 'timestamp': '2025-10-01 04:11:47.549062', 'step': 656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.605789', 'step': 656, 'epoch': 1} {'type': 'loss', 'content': 0.004295154474675655, 'timestamp': '2025-10-01 04:11:47.609624', 'step': 657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:47.660242', 'step': 657, 'epoch': 1} {'type': 'loss', 'content': 0.021786142140626907, 'timestamp': '2025-10-01 04:11:47.665898', 'step': 658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:47.722907', 'step': 658, 'epoch': 1} {'type': 'loss', 'content': 0.006306357216089964, 'timestamp': '2025-10-01 04:11:47.727506', 'step': 659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.787277', 'step': 659, 'epoch': 1} {'type': 'loss', 'content': 0.012746252119541168, 'timestamp': '2025-10-01 04:11:47.812729', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.865566', 'step': 660, 'epoch': 1} {'type': 'loss', 'content': 0.010227000340819359, 'timestamp': '2025-10-01 04:11:47.870772', 'step': 661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.909963', 'step': 661, 'epoch': 1} {'type': 'loss', 'content': 0.00480778981000185, 'timestamp': '2025-10-01 04:11:47.922171', 'step': 662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:47.986343', 'step': 662, 'epoch': 1} {'type': 'loss', 'content': 0.01734638772904873, 'timestamp': '2025-10-01 04:11:47.998855', 'step': 663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:48.046199', 'step': 663, 'epoch': 1} {'type': 'loss', 'content': 0.017665620893239975, 'timestamp': '2025-10-01 04:11:48.080409', 'step': 664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.132670', 'step': 664, 'epoch': 1} {'type': 'loss', 'content': 0.009129605256021023, 'timestamp': '2025-10-01 04:11:48.136471', 'step': 665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.183725', 'step': 665, 'epoch': 1} {'type': 'loss', 'content': 0.0035495725460350513, 'timestamp': '2025-10-01 04:11:48.188771', 'step': 666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.242315', 'step': 666, 'epoch': 1} {'type': 'loss', 'content': 0.008134677074849606, 'timestamp': '2025-10-01 04:11:48.245520', 'step': 667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.293132', 'step': 667, 'epoch': 1} {'type': 'loss', 'content': 0.00795725453644991, 'timestamp': '2025-10-01 04:11:48.329553', 'step': 668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.380905', 'step': 668, 'epoch': 1} {'type': 'loss', 'content': 0.053504396229982376, 'timestamp': '2025-10-01 04:11:48.385393', 'step': 669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:48.435741', 'step': 669, 'epoch': 1} {'type': 'loss', 'content': 0.05954793840646744, 'timestamp': '2025-10-01 04:11:48.447124', 'step': 670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.504910', 'step': 670, 'epoch': 1} {'type': 'loss', 'content': 0.008816796354949474, 'timestamp': '2025-10-01 04:11:48.509743', 'step': 671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.560990', 'step': 671, 'epoch': 1} {'type': 'loss', 'content': 0.017331920564174652, 'timestamp': '2025-10-01 04:11:48.591951', 'step': 672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.642732', 'step': 672, 'epoch': 1} {'type': 'loss', 'content': 0.02753291465342045, 'timestamp': '2025-10-01 04:11:48.652747', 'step': 673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.691722', 'step': 673, 'epoch': 1} {'type': 'loss', 'content': 0.029163358733057976, 'timestamp': '2025-10-01 04:11:48.700026', 'step': 674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.745563', 'step': 674, 'epoch': 1} {'type': 'loss', 'content': 0.05267322063446045, 'timestamp': '2025-10-01 04:11:48.749392', 'step': 675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.801086', 'step': 675, 'epoch': 1} {'type': 'loss', 'content': 0.01871684566140175, 'timestamp': '2025-10-01 04:11:48.827543', 'step': 676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:48.878793', 'step': 676, 'epoch': 1} {'type': 'loss', 'content': 0.025403184816241264, 'timestamp': '2025-10-01 04:11:48.882640', 'step': 677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:48.944070', 'step': 677, 'epoch': 1} {'type': 'loss', 'content': 0.006062587257474661, 'timestamp': '2025-10-01 04:11:48.959017', 'step': 678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:49.009924', 'step': 678, 'epoch': 1} {'type': 'loss', 'content': 0.005435396917164326, 'timestamp': '2025-10-01 04:11:49.019398', 'step': 679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:49.066613', 'step': 679, 'epoch': 1} {'type': 'loss', 'content': 0.017074231058359146, 'timestamp': '2025-10-01 04:11:49.100097', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:49.158526', 'step': 680, 'epoch': 1} {'type': 'loss', 'content': 0.011236979626119137, 'timestamp': '2025-10-01 04:11:49.170680', 'step': 681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:49.220550', 'step': 681, 'epoch': 1} {'type': 'loss', 'content': 0.01346462219953537, 'timestamp': '2025-10-01 04:11:49.229964', 'step': 682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:49.287818', 'step': 682, 'epoch': 1} {'type': 'loss', 'content': 0.01821001246571541, 'timestamp': '2025-10-01 04:11:49.298681', 'step': 683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:49.358282', 'step': 683, 'epoch': 1} {'type': 'loss', 'content': 0.010340625420212746, 'timestamp': '2025-10-01 04:11:49.390650', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:50.690899', 'step': 684, 'epoch': 1} {'type': 'pplx', 'content': 119262264.46404168, 'timestamp': '2025-10-01 04:11:50.699070', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:50.736407', 'step': 684, 'epoch': 1} {'type': 'loss', 'content': 0.03107767179608345, 'timestamp': '2025-10-01 04:11:50.739416', 'step': 685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:50.778891', 'step': 685, 'epoch': 1} {'type': 'loss', 'content': 0.016809258610010147, 'timestamp': '2025-10-01 04:11:50.784183', 'step': 686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:50.821329', 'step': 686, 'epoch': 1} {'type': 'loss', 'content': 0.007657169364392757, 'timestamp': '2025-10-01 04:11:50.823773', 'step': 687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:50.858616', 'step': 687, 'epoch': 1} {'type': 'loss', 'content': 0.009451446123421192, 'timestamp': '2025-10-01 04:11:50.886751', 'step': 688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:50.930086', 'step': 688, 'epoch': 1} {'type': 'loss', 'content': 0.036196958273649216, 'timestamp': '2025-10-01 04:11:50.935044', 'step': 689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:50.973134', 'step': 689, 'epoch': 1} {'type': 'loss', 'content': 0.011105935089290142, 'timestamp': '2025-10-01 04:11:50.975638', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.015717', 'step': 690, 'epoch': 1} {'type': 'loss', 'content': 0.00982974749058485, 'timestamp': '2025-10-01 04:11:51.019017', 'step': 691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:51.059996', 'step': 691, 'epoch': 1} {'type': 'loss', 'content': 0.01059667021036148, 'timestamp': '2025-10-01 04:11:51.087081', 'step': 692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.119460', 'step': 692, 'epoch': 1} {'type': 'loss', 'content': 0.01886533759534359, 'timestamp': '2025-10-01 04:11:51.124016', 'step': 693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:51.159842', 'step': 693, 'epoch': 1} {'type': 'loss', 'content': 0.0357852578163147, 'timestamp': '2025-10-01 04:11:51.165771', 'step': 694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.209301', 'step': 694, 'epoch': 1} {'type': 'loss', 'content': 0.007002557162195444, 'timestamp': '2025-10-01 04:11:51.216887', 'step': 695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:51.260131', 'step': 695, 'epoch': 1} {'type': 'loss', 'content': 0.029862623661756516, 'timestamp': '2025-10-01 04:11:51.288062', 'step': 696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.326956', 'step': 696, 'epoch': 1} {'type': 'loss', 'content': 0.016306012868881226, 'timestamp': '2025-10-01 04:11:51.331337', 'step': 697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.366272', 'step': 697, 'epoch': 1} {'type': 'loss', 'content': 0.015408610925078392, 'timestamp': '2025-10-01 04:11:51.374520', 'step': 698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.417648', 'step': 698, 'epoch': 1} {'type': 'loss', 'content': 0.025938164442777634, 'timestamp': '2025-10-01 04:11:51.421588', 'step': 699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.471695', 'step': 699, 'epoch': 1} {'type': 'loss', 'content': 0.023498162627220154, 'timestamp': '2025-10-01 04:11:51.502819', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:51.553242', 'step': 700, 'epoch': 1} {'type': 'loss', 'content': 0.009221481159329414, 'timestamp': '2025-10-01 04:11:51.556404', 'step': 701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:51.594378', 'step': 701, 'epoch': 1} {'type': 'loss', 'content': 0.02343042753636837, 'timestamp': '2025-10-01 04:11:51.596942', 'step': 702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:51.636919', 'step': 702, 'epoch': 1} {'type': 'loss', 'content': 0.015130646526813507, 'timestamp': '2025-10-01 04:11:51.643347', 'step': 703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.692594', 'step': 703, 'epoch': 1} {'type': 'loss', 'content': 0.013389564119279385, 'timestamp': '2025-10-01 04:11:51.721853', 'step': 704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:51.775838', 'step': 704, 'epoch': 1} {'type': 'loss', 'content': 0.019185690209269524, 'timestamp': '2025-10-01 04:11:51.779289', 'step': 705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:51.814170', 'step': 705, 'epoch': 1} {'type': 'loss', 'content': 0.006280092056840658, 'timestamp': '2025-10-01 04:11:51.817534', 'step': 706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:51.872723', 'step': 706, 'epoch': 1} {'type': 'loss', 'content': 0.006022441666573286, 'timestamp': '2025-10-01 04:11:51.881486', 'step': 707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:51.925097', 'step': 707, 'epoch': 1} {'type': 'loss', 'content': 0.014056635089218616, 'timestamp': '2025-10-01 04:11:51.951983', 'step': 708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:51.995071', 'step': 708, 'epoch': 1} {'type': 'loss', 'content': 0.014901148155331612, 'timestamp': '2025-10-01 04:11:52.000334', 'step': 709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.040164', 'step': 709, 'epoch': 1} {'type': 'loss', 'content': 0.044664185494184494, 'timestamp': '2025-10-01 04:11:52.045037', 'step': 710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:52.085584', 'step': 710, 'epoch': 1} {'type': 'loss', 'content': 0.02443745732307434, 'timestamp': '2025-10-01 04:11:52.090731', 'step': 711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.131077', 'step': 711, 'epoch': 1} {'type': 'loss', 'content': 0.03858523443341255, 'timestamp': '2025-10-01 04:11:52.156477', 'step': 712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.191690', 'step': 712, 'epoch': 1} {'type': 'loss', 'content': 0.016447898000478745, 'timestamp': '2025-10-01 04:11:52.197775', 'step': 713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.239379', 'step': 713, 'epoch': 1} {'type': 'loss', 'content': 0.010604329407215118, 'timestamp': '2025-10-01 04:11:52.243288', 'step': 714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:52.278282', 'step': 714, 'epoch': 1} {'type': 'loss', 'content': 0.029538193717598915, 'timestamp': '2025-10-01 04:11:52.283351', 'step': 715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.319336', 'step': 715, 'epoch': 1} {'type': 'loss', 'content': 0.040076810866594315, 'timestamp': '2025-10-01 04:11:52.346855', 'step': 716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.389234', 'step': 716, 'epoch': 1} {'type': 'loss', 'content': 0.020004551857709885, 'timestamp': '2025-10-01 04:11:52.391875', 'step': 717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.428625', 'step': 717, 'epoch': 1} {'type': 'loss', 'content': 0.016628902405500412, 'timestamp': '2025-10-01 04:11:52.433795', 'step': 718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.473938', 'step': 718, 'epoch': 1} {'type': 'loss', 'content': 0.012509877793490887, 'timestamp': '2025-10-01 04:11:52.476953', 'step': 719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:52.522188', 'step': 719, 'epoch': 1} {'type': 'loss', 'content': 0.031048806384205818, 'timestamp': '2025-10-01 04:11:52.550284', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.594671', 'step': 720, 'epoch': 1} {'type': 'loss', 'content': 0.005084963981062174, 'timestamp': '2025-10-01 04:11:52.597221', 'step': 721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.638860', 'step': 721, 'epoch': 1} {'type': 'loss', 'content': 0.004975512158125639, 'timestamp': '2025-10-01 04:11:52.646864', 'step': 722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.686807', 'step': 722, 'epoch': 1} {'type': 'loss', 'content': 0.012422831729054451, 'timestamp': '2025-10-01 04:11:52.696729', 'step': 723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.742894', 'step': 723, 'epoch': 1} {'type': 'loss', 'content': 0.02685965597629547, 'timestamp': '2025-10-01 04:11:52.770524', 'step': 724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.810267', 'step': 724, 'epoch': 1} {'type': 'loss', 'content': 0.038959138095378876, 'timestamp': '2025-10-01 04:11:52.815399', 'step': 725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:52.855285', 'step': 725, 'epoch': 1} {'type': 'loss', 'content': 0.021698709577322006, 'timestamp': '2025-10-01 04:11:52.858958', 'step': 726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:52.895979', 'step': 726, 'epoch': 1} {'type': 'loss', 'content': 0.016632195562124252, 'timestamp': '2025-10-01 04:11:52.901026', 'step': 727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:52.941132', 'step': 727, 'epoch': 1} {'type': 'loss', 'content': 0.009156654588878155, 'timestamp': '2025-10-01 04:11:52.967868', 'step': 728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.001799', 'step': 728, 'epoch': 1} {'type': 'loss', 'content': 0.009549522772431374, 'timestamp': '2025-10-01 04:11:53.006254', 'step': 729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.041607', 'step': 729, 'epoch': 1} {'type': 'loss', 'content': 0.03153882548213005, 'timestamp': '2025-10-01 04:11:53.050118', 'step': 730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.091813', 'step': 730, 'epoch': 1} {'type': 'loss', 'content': 0.008350717835128307, 'timestamp': '2025-10-01 04:11:53.094476', 'step': 731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.139106', 'step': 731, 'epoch': 1} {'type': 'loss', 'content': 0.02463681623339653, 'timestamp': '2025-10-01 04:11:53.169284', 'step': 732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:53.212469', 'step': 732, 'epoch': 1} {'type': 'loss', 'content': 0.019675912335515022, 'timestamp': '2025-10-01 04:11:53.220057', 'step': 733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:53.256920', 'step': 733, 'epoch': 1} {'type': 'loss', 'content': 0.01801421120762825, 'timestamp': '2025-10-01 04:11:53.267495', 'step': 734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:53.327589', 'step': 734, 'epoch': 1} {'type': 'loss', 'content': 0.015794653445482254, 'timestamp': '2025-10-01 04:11:53.335394', 'step': 735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.374717', 'step': 735, 'epoch': 1} {'type': 'loss', 'content': 0.010897762142121792, 'timestamp': '2025-10-01 04:11:53.405601', 'step': 736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.445501', 'step': 736, 'epoch': 1} {'type': 'loss', 'content': 0.02856982871890068, 'timestamp': '2025-10-01 04:11:53.454843', 'step': 737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.496780', 'step': 737, 'epoch': 1} {'type': 'loss', 'content': 0.014090476557612419, 'timestamp': '2025-10-01 04:11:53.500010', 'step': 738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.563469', 'step': 738, 'epoch': 1} {'type': 'loss', 'content': 0.022237354889512062, 'timestamp': '2025-10-01 04:11:53.573453', 'step': 739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:53.627816', 'step': 739, 'epoch': 1} {'type': 'loss', 'content': 0.0287479180842638, 'timestamp': '2025-10-01 04:11:53.661257', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:53.714096', 'step': 740, 'epoch': 1} {'type': 'loss', 'content': 0.028681013733148575, 'timestamp': '2025-10-01 04:11:53.724233', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:54.833457', 'step': 741, 'epoch': 1} {'type': 'pplx', 'content': 108961684.90475054, 'timestamp': '2025-10-01 04:11:54.839330', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:54.872578', 'step': 741, 'epoch': 1} {'type': 'loss', 'content': 0.013316790573298931, 'timestamp': '2025-10-01 04:11:54.875134', 'step': 742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:54.914749', 'step': 742, 'epoch': 1} {'type': 'loss', 'content': 0.015716271474957466, 'timestamp': '2025-10-01 04:11:54.923268', 'step': 743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:54.965364', 'step': 743, 'epoch': 1} {'type': 'loss', 'content': 0.03807631880044937, 'timestamp': '2025-10-01 04:11:54.990275', 'step': 744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:55.026831', 'step': 744, 'epoch': 1} {'type': 'loss', 'content': 0.014975814148783684, 'timestamp': '2025-10-01 04:11:55.028965', 'step': 745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.065030', 'step': 745, 'epoch': 1} {'type': 'loss', 'content': 0.015280152671039104, 'timestamp': '2025-10-01 04:11:55.068313', 'step': 746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:55.101661', 'step': 746, 'epoch': 1} {'type': 'loss', 'content': 0.012907147407531738, 'timestamp': '2025-10-01 04:11:55.103929', 'step': 747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.138170', 'step': 747, 'epoch': 1} {'type': 'loss', 'content': 0.029170582070946693, 'timestamp': '2025-10-01 04:11:55.162265', 'step': 748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.194297', 'step': 748, 'epoch': 1} {'type': 'loss', 'content': 0.023870984092354774, 'timestamp': '2025-10-01 04:11:55.197224', 'step': 749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:55.230218', 'step': 749, 'epoch': 1} {'type': 'loss', 'content': 0.019887370988726616, 'timestamp': '2025-10-01 04:11:55.233192', 'step': 750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:55.266420', 'step': 750, 'epoch': 1} {'type': 'loss', 'content': 0.021655777469277382, 'timestamp': '2025-10-01 04:11:55.269631', 'step': 751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.302756', 'step': 751, 'epoch': 1} {'type': 'loss', 'content': 0.03200121968984604, 'timestamp': '2025-10-01 04:11:55.326805', 'step': 752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.362029', 'step': 752, 'epoch': 1} {'type': 'loss', 'content': 0.02003059722483158, 'timestamp': '2025-10-01 04:11:55.367595', 'step': 753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:55.401802', 'step': 753, 'epoch': 1} {'type': 'loss', 'content': 0.0149669349193573, 'timestamp': '2025-10-01 04:11:55.414367', 'step': 754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:55.458123', 'step': 754, 'epoch': 1} {'type': 'loss', 'content': 0.018943343311548233, 'timestamp': '2025-10-01 04:11:55.463901', 'step': 755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:55.507977', 'step': 755, 'epoch': 1} {'type': 'loss', 'content': 0.02300729975104332, 'timestamp': '2025-10-01 04:11:55.536691', 'step': 756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.583594', 'step': 756, 'epoch': 1} {'type': 'loss', 'content': 0.006794582586735487, 'timestamp': '2025-10-01 04:11:55.592780', 'step': 757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:55.638867', 'step': 757, 'epoch': 1} {'type': 'loss', 'content': 0.007976575754582882, 'timestamp': '2025-10-01 04:11:55.647286', 'step': 758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:55.700965', 'step': 758, 'epoch': 1} {'type': 'loss', 'content': 0.011165034957230091, 'timestamp': '2025-10-01 04:11:55.709016', 'step': 759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.749078', 'step': 759, 'epoch': 1} {'type': 'loss', 'content': 0.014530644752085209, 'timestamp': '2025-10-01 04:11:55.778360', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.814274', 'step': 760, 'epoch': 1} {'type': 'loss', 'content': 0.00375539087690413, 'timestamp': '2025-10-01 04:11:55.817577', 'step': 761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:11:55.858699', 'step': 761, 'epoch': 1} {'type': 'loss', 'content': 0.006393683608621359, 'timestamp': '2025-10-01 04:11:55.861649', 'step': 762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.897046', 'step': 762, 'epoch': 1} {'type': 'loss', 'content': 0.04293840005993843, 'timestamp': '2025-10-01 04:11:55.899758', 'step': 763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.934308', 'step': 763, 'epoch': 1} {'type': 'loss', 'content': 0.01398535817861557, 'timestamp': '2025-10-01 04:11:55.959440', 'step': 764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:55.994567', 'step': 764, 'epoch': 1} {'type': 'loss', 'content': 0.00975012220442295, 'timestamp': '2025-10-01 04:11:55.999939', 'step': 765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.035453', 'step': 765, 'epoch': 1} {'type': 'loss', 'content': 0.020558195188641548, 'timestamp': '2025-10-01 04:11:56.040047', 'step': 766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.076561', 'step': 766, 'epoch': 1} {'type': 'loss', 'content': 0.010339805856347084, 'timestamp': '2025-10-01 04:11:56.079922', 'step': 767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.117425', 'step': 767, 'epoch': 1} {'type': 'loss', 'content': 0.05061133950948715, 'timestamp': '2025-10-01 04:11:56.146045', 'step': 768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.190804', 'step': 768, 'epoch': 1} {'type': 'loss', 'content': 0.016301441937685013, 'timestamp': '2025-10-01 04:11:56.194700', 'step': 769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.236612', 'step': 769, 'epoch': 1} {'type': 'loss', 'content': 0.030423134565353394, 'timestamp': '2025-10-01 04:11:56.246395', 'step': 770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:56.299963', 'step': 770, 'epoch': 1} {'type': 'loss', 'content': 0.027092410251498222, 'timestamp': '2025-10-01 04:11:56.309971', 'step': 771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:56.369486', 'step': 771, 'epoch': 1} {'type': 'loss', 'content': 0.004554868675768375, 'timestamp': '2025-10-01 04:11:56.393571', 'step': 772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.445387', 'step': 772, 'epoch': 1} {'type': 'loss', 'content': 0.05927324295043945, 'timestamp': '2025-10-01 04:11:56.454927', 'step': 773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.503918', 'step': 773, 'epoch': 1} {'type': 'loss', 'content': 0.003543505212292075, 'timestamp': '2025-10-01 04:11:56.511457', 'step': 774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:56.556983', 'step': 774, 'epoch': 1} {'type': 'loss', 'content': 0.03816291317343712, 'timestamp': '2025-10-01 04:11:56.562564', 'step': 775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:56.598733', 'step': 775, 'epoch': 1} {'type': 'loss', 'content': 0.03273823857307434, 'timestamp': '2025-10-01 04:11:56.627274', 'step': 776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:56.671804', 'step': 776, 'epoch': 1} {'type': 'loss', 'content': 0.004695338662713766, 'timestamp': '2025-10-01 04:11:56.675230', 'step': 777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.728301', 'step': 777, 'epoch': 1} {'type': 'loss', 'content': 0.002485628006979823, 'timestamp': '2025-10-01 04:11:56.738272', 'step': 778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:56.783631', 'step': 778, 'epoch': 1} {'type': 'loss', 'content': 0.027432991191744804, 'timestamp': '2025-10-01 04:11:56.786230', 'step': 779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.828549', 'step': 779, 'epoch': 1} {'type': 'loss', 'content': 0.007833785377442837, 'timestamp': '2025-10-01 04:11:56.858782', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.906567', 'step': 780, 'epoch': 1} {'type': 'loss', 'content': 0.025769120082259178, 'timestamp': '2025-10-01 04:11:56.916001', 'step': 781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:56.955938', 'step': 781, 'epoch': 1} {'type': 'loss', 'content': 0.02049829252064228, 'timestamp': '2025-10-01 04:11:56.960001', 'step': 782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:56.997915', 'step': 782, 'epoch': 1} {'type': 'loss', 'content': 0.03482077643275261, 'timestamp': '2025-10-01 04:11:57.003600', 'step': 783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:57.039411', 'step': 783, 'epoch': 1} {'type': 'loss', 'content': 0.014167026616632938, 'timestamp': '2025-10-01 04:11:57.063203', 'step': 784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.108893', 'step': 784, 'epoch': 1} {'type': 'loss', 'content': 0.008381464518606663, 'timestamp': '2025-10-01 04:11:57.116584', 'step': 785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:57.167527', 'step': 785, 'epoch': 1} {'type': 'loss', 'content': 0.005195831414312124, 'timestamp': '2025-10-01 04:11:57.174777', 'step': 786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.232737', 'step': 786, 'epoch': 1} {'type': 'loss', 'content': 0.0064064571633934975, 'timestamp': '2025-10-01 04:11:57.237699', 'step': 787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.291393', 'step': 787, 'epoch': 1} {'type': 'loss', 'content': 0.025258291512727737, 'timestamp': '2025-10-01 04:11:57.322812', 'step': 788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.378250', 'step': 788, 'epoch': 1} {'type': 'loss', 'content': 0.011926391161978245, 'timestamp': '2025-10-01 04:11:57.385280', 'step': 789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:57.431924', 'step': 789, 'epoch': 1} {'type': 'loss', 'content': 0.006661646068096161, 'timestamp': '2025-10-01 04:11:57.434666', 'step': 790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.479799', 'step': 790, 'epoch': 1} {'type': 'loss', 'content': 0.01632728986442089, 'timestamp': '2025-10-01 04:11:57.483261', 'step': 791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.532253', 'step': 791, 'epoch': 1} {'type': 'loss', 'content': 0.02356698177754879, 'timestamp': '2025-10-01 04:11:57.559952', 'step': 792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.600552', 'step': 792, 'epoch': 1} {'type': 'loss', 'content': 0.015473469160497189, 'timestamp': '2025-10-01 04:11:57.607668', 'step': 793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.650351', 'step': 793, 'epoch': 1} {'type': 'loss', 'content': 0.034613873809576035, 'timestamp': '2025-10-01 04:11:57.656551', 'step': 794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.697746', 'step': 794, 'epoch': 1} {'type': 'loss', 'content': 0.014318762347102165, 'timestamp': '2025-10-01 04:11:57.703561', 'step': 795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.744198', 'step': 795, 'epoch': 1} {'type': 'loss', 'content': 0.008876429870724678, 'timestamp': '2025-10-01 04:11:57.771183', 'step': 796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:57.813699', 'step': 796, 'epoch': 1} {'type': 'loss', 'content': 0.012135490775108337, 'timestamp': '2025-10-01 04:11:57.816535', 'step': 797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:57.857964', 'step': 797, 'epoch': 1} {'type': 'loss', 'content': 0.017030585557222366, 'timestamp': '2025-10-01 04:11:57.863410', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:11:59.172396', 'step': 798, 'epoch': 1} {'type': 'pplx', 'content': 102363157.80185008, 'timestamp': '2025-10-01 04:11:59.178535', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.218950', 'step': 798, 'epoch': 1} {'type': 'loss', 'content': 0.008845179341733456, 'timestamp': '2025-10-01 04:11:59.225271', 'step': 799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.265737', 'step': 799, 'epoch': 1} {'type': 'loss', 'content': 0.036368466913700104, 'timestamp': '2025-10-01 04:11:59.295196', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.337808', 'step': 800, 'epoch': 1} {'type': 'loss', 'content': 0.025295143947005272, 'timestamp': '2025-10-01 04:11:59.343996', 'step': 801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:59.383178', 'step': 801, 'epoch': 1} {'type': 'loss', 'content': 0.027338379994034767, 'timestamp': '2025-10-01 04:11:59.386776', 'step': 802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.421196', 'step': 802, 'epoch': 1} {'type': 'loss', 'content': 0.024521732702851295, 'timestamp': '2025-10-01 04:11:59.424105', 'step': 803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.458420', 'step': 803, 'epoch': 1} {'type': 'loss', 'content': 0.012448400259017944, 'timestamp': '2025-10-01 04:11:59.482844', 'step': 804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.516700', 'step': 804, 'epoch': 1} {'type': 'loss', 'content': 0.02401571348309517, 'timestamp': '2025-10-01 04:11:59.520582', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.557363', 'step': 805, 'epoch': 1} {'type': 'loss', 'content': 0.023896140977740288, 'timestamp': '2025-10-01 04:11:59.560563', 'step': 806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.598301', 'step': 806, 'epoch': 1} {'type': 'loss', 'content': 0.013595336116850376, 'timestamp': '2025-10-01 04:11:59.603392', 'step': 807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:59.642625', 'step': 807, 'epoch': 1} {'type': 'loss', 'content': 0.019209247082471848, 'timestamp': '2025-10-01 04:11:59.667810', 'step': 808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:59.703976', 'step': 808, 'epoch': 1} {'type': 'loss', 'content': 0.01122989971190691, 'timestamp': '2025-10-01 04:11:59.707556', 'step': 809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:11:59.741656', 'step': 809, 'epoch': 1} {'type': 'loss', 'content': 0.015157909132540226, 'timestamp': '2025-10-01 04:11:59.747150', 'step': 810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.786437', 'step': 810, 'epoch': 1} {'type': 'loss', 'content': 0.023954039439558983, 'timestamp': '2025-10-01 04:11:59.792755', 'step': 811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.834677', 'step': 811, 'epoch': 1} {'type': 'loss', 'content': 0.01369868777692318, 'timestamp': '2025-10-01 04:11:59.861648', 'step': 812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:11:59.902049', 'step': 812, 'epoch': 1} {'type': 'loss', 'content': 0.021969353780150414, 'timestamp': '2025-10-01 04:11:59.905681', 'step': 813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:11:59.949346', 'step': 813, 'epoch': 1} {'type': 'loss', 'content': 0.015957282856106758, 'timestamp': '2025-10-01 04:11:59.959602', 'step': 814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:00.017177', 'step': 814, 'epoch': 1} {'type': 'loss', 'content': 0.010398841463029385, 'timestamp': '2025-10-01 04:12:00.025020', 'step': 815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:00.076324', 'step': 815, 'epoch': 1} {'type': 'loss', 'content': 0.017073335126042366, 'timestamp': '2025-10-01 04:12:00.107522', 'step': 816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:00.155939', 'step': 816, 'epoch': 1} {'type': 'loss', 'content': 0.03205118328332901, 'timestamp': '2025-10-01 04:12:00.166664', 'step': 817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:00.208485', 'step': 817, 'epoch': 1} {'type': 'loss', 'content': 0.018138429149985313, 'timestamp': '2025-10-01 04:12:00.211567', 'step': 818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:00.257646', 'step': 818, 'epoch': 1} {'type': 'loss', 'content': 0.010016137734055519, 'timestamp': '2025-10-01 04:12:00.261962', 'step': 819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:00.313960', 'step': 819, 'epoch': 1} {'type': 'loss', 'content': 0.010501930490136147, 'timestamp': '2025-10-01 04:12:00.343409', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:00.396914', 'step': 820, 'epoch': 1} {'type': 'loss', 'content': 0.012027454562485218, 'timestamp': '2025-10-01 04:12:00.409559', 'step': 821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:00.469584', 'step': 821, 'epoch': 1} {'type': 'loss', 'content': 0.017790386453270912, 'timestamp': '2025-10-01 04:12:00.480896', 'step': 822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:00.538976', 'step': 822, 'epoch': 1} {'type': 'loss', 'content': 0.014075555838644505, 'timestamp': '2025-10-01 04:12:00.551299', 'step': 823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:00.602096', 'step': 823, 'epoch': 1} {'type': 'loss', 'content': 0.02951137162744999, 'timestamp': '2025-10-01 04:12:00.635264', 'step': 824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:00.691076', 'step': 824, 'epoch': 1} {'type': 'loss', 'content': 0.01087740994989872, 'timestamp': '2025-10-01 04:12:00.697185', 'step': 825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:00.751038', 'step': 825, 'epoch': 1} {'type': 'loss', 'content': 0.015133983455598354, 'timestamp': '2025-10-01 04:12:00.757248', 'step': 826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:00.801679', 'step': 826, 'epoch': 1} {'type': 'loss', 'content': 0.03115757927298546, 'timestamp': '2025-10-01 04:12:00.803949', 'step': 827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:00.849409', 'step': 827, 'epoch': 1} {'type': 'loss', 'content': 0.012817888520658016, 'timestamp': '2025-10-01 04:12:00.878164', 'step': 828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:00.933169', 'step': 828, 'epoch': 1} {'type': 'loss', 'content': 0.01865779235959053, 'timestamp': '2025-10-01 04:12:00.942699', 'step': 829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.000681', 'step': 829, 'epoch': 1} {'type': 'loss', 'content': 0.005558286793529987, 'timestamp': '2025-10-01 04:12:01.005026', 'step': 830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.065180', 'step': 830, 'epoch': 1} {'type': 'loss', 'content': 0.007820365950465202, 'timestamp': '2025-10-01 04:12:01.078400', 'step': 831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.140996', 'step': 831, 'epoch': 1} {'type': 'loss', 'content': 0.006412907503545284, 'timestamp': '2025-10-01 04:12:01.173058', 'step': 832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.218904', 'step': 832, 'epoch': 1} {'type': 'loss', 'content': 0.028616532683372498, 'timestamp': '2025-10-01 04:12:01.225155', 'step': 833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:01.266444', 'step': 833, 'epoch': 1} {'type': 'loss', 'content': 0.013076567091047764, 'timestamp': '2025-10-01 04:12:01.273313', 'step': 834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:01.316739', 'step': 834, 'epoch': 1} {'type': 'loss', 'content': 0.00448998948559165, 'timestamp': '2025-10-01 04:12:01.318783', 'step': 835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.359621', 'step': 835, 'epoch': 1} {'type': 'loss', 'content': 0.007429972290992737, 'timestamp': '2025-10-01 04:12:01.386693', 'step': 836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:01.431201', 'step': 836, 'epoch': 1} {'type': 'loss', 'content': 0.034416042268276215, 'timestamp': '2025-10-01 04:12:01.439471', 'step': 837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.487945', 'step': 837, 'epoch': 1} {'type': 'loss', 'content': 0.015252464450895786, 'timestamp': '2025-10-01 04:12:01.497006', 'step': 838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.553566', 'step': 838, 'epoch': 1} {'type': 'loss', 'content': 0.030947471037507057, 'timestamp': '2025-10-01 04:12:01.561050', 'step': 839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:01.607202', 'step': 839, 'epoch': 1} {'type': 'loss', 'content': 0.01654486544430256, 'timestamp': '2025-10-01 04:12:01.632622', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.682363', 'step': 840, 'epoch': 1} {'type': 'loss', 'content': 0.01720511168241501, 'timestamp': '2025-10-01 04:12:01.688159', 'step': 841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.728450', 'step': 841, 'epoch': 1} {'type': 'loss', 'content': 0.031022807583212852, 'timestamp': '2025-10-01 04:12:01.732792', 'step': 842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.769216', 'step': 842, 'epoch': 1} {'type': 'loss', 'content': 0.01400839351117611, 'timestamp': '2025-10-01 04:12:01.775955', 'step': 843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:01.824158', 'step': 843, 'epoch': 1} {'type': 'loss', 'content': 0.017199808731675148, 'timestamp': '2025-10-01 04:12:01.848412', 'step': 844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:01.906886', 'step': 844, 'epoch': 1} {'type': 'loss', 'content': 0.0067504760809242725, 'timestamp': '2025-10-01 04:12:01.915826', 'step': 845, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:01.970429', 'step': 845, 'epoch': 1} {'type': 'loss', 'content': 0.0025089120026677847, 'timestamp': '2025-10-01 04:12:01.981948', 'step': 846, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:02.040140', 'step': 846, 'epoch': 1} {'type': 'loss', 'content': 0.030881991609930992, 'timestamp': '2025-10-01 04:12:02.051862', 'step': 847, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:02.112878', 'step': 847, 'epoch': 1} {'type': 'loss', 'content': 0.01905219629406929, 'timestamp': '2025-10-01 04:12:02.138776', 'step': 848, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:02.194945', 'step': 848, 'epoch': 1} {'type': 'loss', 'content': 0.005448561627417803, 'timestamp': '2025-10-01 04:12:02.202879', 'step': 849, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:02.250476', 'step': 849, 'epoch': 1} {'type': 'loss', 'content': 0.003623712807893753, 'timestamp': '2025-10-01 04:12:02.254248', 'step': 850, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:02.303264', 'step': 850, 'epoch': 1} {'type': 'loss', 'content': 0.0068411752581596375, 'timestamp': '2025-10-01 04:12:02.308834', 'step': 851, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:02.350799', 'step': 851, 'epoch': 1} {'type': 'loss', 'content': 0.020454945042729378, 'timestamp': '2025-10-01 04:12:02.383078', 'step': 852, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:02.422018', 'step': 852, 'epoch': 1} {'type': 'loss', 'content': 0.006170716602355242, 'timestamp': '2025-10-01 04:12:02.428049', 'step': 853, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:02.481776', 'step': 853, 'epoch': 1} {'type': 'loss', 'content': 0.022493934258818626, 'timestamp': '2025-10-01 04:12:02.486925', 'step': 854, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:02.522094', 'step': 854, 'epoch': 1} {'type': 'loss', 'content': 0.0045220511965453625, 'timestamp': '2025-10-01 04:12:02.526650', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:03.994172', 'step': 855, 'epoch': 1} {'type': 'pplx', 'content': 107042227.57520783, 'timestamp': '2025-10-01 04:12:04.004953', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.056939', 'step': 855, 'epoch': 1} {'type': 'loss', 'content': 0.0042382897809147835, 'timestamp': '2025-10-01 04:12:04.089067', 'step': 856, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.165021', 'step': 856, 'epoch': 1} {'type': 'loss', 'content': 0.014735554344952106, 'timestamp': '2025-10-01 04:12:04.175888', 'step': 857, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:04.226904', 'step': 857, 'epoch': 1} {'type': 'loss', 'content': 0.011172778904438019, 'timestamp': '2025-10-01 04:12:04.236757', 'step': 858, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.291464', 'step': 858, 'epoch': 1} {'type': 'loss', 'content': 0.013173661194741726, 'timestamp': '2025-10-01 04:12:04.299580', 'step': 859, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.351204', 'step': 859, 'epoch': 1} {'type': 'loss', 'content': 0.027727048844099045, 'timestamp': '2025-10-01 04:12:04.382371', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.434095', 'step': 860, 'epoch': 1} {'type': 'loss', 'content': 0.017057133838534355, 'timestamp': '2025-10-01 04:12:04.443826', 'step': 861, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:04.495787', 'step': 861, 'epoch': 1} {'type': 'loss', 'content': 0.007997279986739159, 'timestamp': '2025-10-01 04:12:04.506394', 'step': 862, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.561733', 'step': 862, 'epoch': 1} {'type': 'loss', 'content': 0.01755506731569767, 'timestamp': '2025-10-01 04:12:04.569684', 'step': 863, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.625051', 'step': 863, 'epoch': 1} {'type': 'loss', 'content': 0.013182953000068665, 'timestamp': '2025-10-01 04:12:04.649663', 'step': 864, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.698884', 'step': 864, 'epoch': 1} {'type': 'loss', 'content': 0.03394387289881706, 'timestamp': '2025-10-01 04:12:04.708783', 'step': 865, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:04.756031', 'step': 865, 'epoch': 1} {'type': 'loss', 'content': 0.009329610504209995, 'timestamp': '2025-10-01 04:12:04.761469', 'step': 866, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.802913', 'step': 866, 'epoch': 1} {'type': 'loss', 'content': 0.004415162838995457, 'timestamp': '2025-10-01 04:12:04.809347', 'step': 867, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.847986', 'step': 867, 'epoch': 1} {'type': 'loss', 'content': 0.014574938453733921, 'timestamp': '2025-10-01 04:12:04.876622', 'step': 868, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:04.923842', 'step': 868, 'epoch': 1} {'type': 'loss', 'content': 0.01505463756620884, 'timestamp': '2025-10-01 04:12:04.926431', 'step': 869, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:04.971915', 'step': 869, 'epoch': 1} {'type': 'loss', 'content': 0.03582816570997238, 'timestamp': '2025-10-01 04:12:04.981438', 'step': 870, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.022620', 'step': 870, 'epoch': 1} {'type': 'loss', 'content': 0.002920368453487754, 'timestamp': '2025-10-01 04:12:05.025098', 'step': 871, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.070777', 'step': 871, 'epoch': 1} {'type': 'loss', 'content': 0.002681077690795064, 'timestamp': '2025-10-01 04:12:05.098583', 'step': 872, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.145298', 'step': 872, 'epoch': 1} {'type': 'loss', 'content': 0.010480071417987347, 'timestamp': '2025-10-01 04:12:05.153231', 'step': 873, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.203275', 'step': 873, 'epoch': 1} {'type': 'loss', 'content': 0.00535059766843915, 'timestamp': '2025-10-01 04:12:05.210753', 'step': 874, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.251607', 'step': 874, 'epoch': 1} {'type': 'loss', 'content': 0.02054414339363575, 'timestamp': '2025-10-01 04:12:05.256635', 'step': 875, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.293195', 'step': 875, 'epoch': 1} {'type': 'loss', 'content': 0.029474778100848198, 'timestamp': '2025-10-01 04:12:05.318441', 'step': 876, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:05.358847', 'step': 876, 'epoch': 1} {'type': 'loss', 'content': 0.006558016873896122, 'timestamp': '2025-10-01 04:12:05.366568', 'step': 877, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.400568', 'step': 877, 'epoch': 1} {'type': 'loss', 'content': 0.0020208796486258507, 'timestamp': '2025-10-01 04:12:05.409495', 'step': 878, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.461083', 'step': 878, 'epoch': 1} {'type': 'loss', 'content': 0.030072884634137154, 'timestamp': '2025-10-01 04:12:05.466107', 'step': 879, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:05.522738', 'step': 879, 'epoch': 1} {'type': 'loss', 'content': 0.04798530787229538, 'timestamp': '2025-10-01 04:12:05.547105', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.595769', 'step': 880, 'epoch': 1} {'type': 'loss', 'content': 0.02707165852189064, 'timestamp': '2025-10-01 04:12:05.598633', 'step': 881, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.643281', 'step': 881, 'epoch': 1} {'type': 'loss', 'content': 0.04058273881673813, 'timestamp': '2025-10-01 04:12:05.650065', 'step': 882, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.706593', 'step': 882, 'epoch': 1} {'type': 'loss', 'content': 0.03420673683285713, 'timestamp': '2025-10-01 04:12:05.717500', 'step': 883, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:05.767365', 'step': 883, 'epoch': 1} {'type': 'loss', 'content': 0.0023017581552267075, 'timestamp': '2025-10-01 04:12:05.792872', 'step': 884, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.837273', 'step': 884, 'epoch': 1} {'type': 'loss', 'content': 0.00631122337654233, 'timestamp': '2025-10-01 04:12:05.848497', 'step': 885, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.892020', 'step': 885, 'epoch': 1} {'type': 'loss', 'content': 0.02051430568099022, 'timestamp': '2025-10-01 04:12:05.902489', 'step': 886, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:05.950386', 'step': 886, 'epoch': 1} {'type': 'loss', 'content': 0.008391078561544418, 'timestamp': '2025-10-01 04:12:05.962579', 'step': 887, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:05.999759', 'step': 887, 'epoch': 1} {'type': 'loss', 'content': 0.02948450855910778, 'timestamp': '2025-10-01 04:12:06.033511', 'step': 888, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.092382', 'step': 888, 'epoch': 1} {'type': 'loss', 'content': 0.017128828912973404, 'timestamp': '2025-10-01 04:12:06.103309', 'step': 889, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.169790', 'step': 889, 'epoch': 1} {'type': 'loss', 'content': 0.030220722779631615, 'timestamp': '2025-10-01 04:12:06.176744', 'step': 890, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.224499', 'step': 890, 'epoch': 1} {'type': 'loss', 'content': 0.022421736270189285, 'timestamp': '2025-10-01 04:12:06.235003', 'step': 891, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:06.288345', 'step': 891, 'epoch': 1} {'type': 'loss', 'content': 0.015720544382929802, 'timestamp': '2025-10-01 04:12:06.313720', 'step': 892, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:06.361819', 'step': 892, 'epoch': 1} {'type': 'loss', 'content': 0.01451858039945364, 'timestamp': '2025-10-01 04:12:06.367119', 'step': 893, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.405891', 'step': 893, 'epoch': 1} {'type': 'loss', 'content': 0.04697972536087036, 'timestamp': '2025-10-01 04:12:06.411375', 'step': 894, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.454651', 'step': 894, 'epoch': 1} {'type': 'loss', 'content': 0.021802568808197975, 'timestamp': '2025-10-01 04:12:06.459383', 'step': 895, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.502047', 'step': 895, 'epoch': 1} {'type': 'loss', 'content': 0.021685032173991203, 'timestamp': '2025-10-01 04:12:06.528635', 'step': 896, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.571771', 'step': 896, 'epoch': 1} {'type': 'loss', 'content': 0.0264862310141325, 'timestamp': '2025-10-01 04:12:06.575865', 'step': 897, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:06.613185', 'step': 897, 'epoch': 1} {'type': 'loss', 'content': 0.014706993475556374, 'timestamp': '2025-10-01 04:12:06.618410', 'step': 898, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:06.656039', 'step': 898, 'epoch': 1} {'type': 'loss', 'content': 0.010481654666364193, 'timestamp': '2025-10-01 04:12:06.660860', 'step': 899, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:06.702758', 'step': 899, 'epoch': 1} {'type': 'loss', 'content': 0.013229523785412312, 'timestamp': '2025-10-01 04:12:06.729066', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.768662', 'step': 900, 'epoch': 1} {'type': 'loss', 'content': 0.026366079226136208, 'timestamp': '2025-10-01 04:12:06.771913', 'step': 901, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.809447', 'step': 901, 'epoch': 1} {'type': 'loss', 'content': 0.021769892424345016, 'timestamp': '2025-10-01 04:12:06.813207', 'step': 902, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.857845', 'step': 902, 'epoch': 1} {'type': 'loss', 'content': 0.007020240183919668, 'timestamp': '2025-10-01 04:12:06.860778', 'step': 903, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.903744', 'step': 903, 'epoch': 1} {'type': 'loss', 'content': 0.035886842757463455, 'timestamp': '2025-10-01 04:12:06.933546', 'step': 904, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:06.981239', 'step': 904, 'epoch': 1} {'type': 'loss', 'content': 0.012589776888489723, 'timestamp': '2025-10-01 04:12:06.984791', 'step': 905, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:07.023171', 'step': 905, 'epoch': 1} {'type': 'loss', 'content': 0.009022383950650692, 'timestamp': '2025-10-01 04:12:07.026527', 'step': 906, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:07.067039', 'step': 906, 'epoch': 1} {'type': 'loss', 'content': 0.03269534558057785, 'timestamp': '2025-10-01 04:12:07.069709', 'step': 907, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:07.106064', 'step': 907, 'epoch': 1} {'type': 'loss', 'content': 0.020773855969309807, 'timestamp': '2025-10-01 04:12:07.132141', 'step': 908, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:07.167996', 'step': 908, 'epoch': 1} {'type': 'loss', 'content': 0.015614612959325314, 'timestamp': '2025-10-01 04:12:07.172464', 'step': 909, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:07.211257', 'step': 909, 'epoch': 1} {'type': 'loss', 'content': 0.0035907472483813763, 'timestamp': '2025-10-01 04:12:07.213758', 'step': 910, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:07.252624', 'step': 910, 'epoch': 1} {'type': 'loss', 'content': 0.016620244830846786, 'timestamp': '2025-10-01 04:12:07.260429', 'step': 911, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:07.306415', 'step': 911, 'epoch': 1} {'type': 'loss', 'content': 0.005902507808059454, 'timestamp': '2025-10-01 04:12:07.339644', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:08.515415', 'step': 912, 'epoch': 1} {'type': 'pplx', 'content': 96790900.33100773, 'timestamp': '2025-10-01 04:12:08.518295', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:08.549571', 'step': 912, 'epoch': 1} {'type': 'loss', 'content': 0.044129107147455215, 'timestamp': '2025-10-01 04:12:08.555513', 'step': 913, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:08.602448', 'step': 913, 'epoch': 1} {'type': 'loss', 'content': 0.010207407176494598, 'timestamp': '2025-10-01 04:12:08.609138', 'step': 914, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:08.651030', 'step': 914, 'epoch': 1} {'type': 'loss', 'content': 0.006177072878926992, 'timestamp': '2025-10-01 04:12:08.654898', 'step': 915, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:08.690472', 'step': 915, 'epoch': 1} {'type': 'loss', 'content': 0.0061223325319588184, 'timestamp': '2025-10-01 04:12:08.715821', 'step': 916, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:08.761149', 'step': 916, 'epoch': 1} {'type': 'loss', 'content': 0.03369773551821709, 'timestamp': '2025-10-01 04:12:08.769894', 'step': 917, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:08.831986', 'step': 917, 'epoch': 2} {'type': 'loss', 'content': 0.04762774705886841, 'timestamp': '2025-10-01 04:12:08.837318', 'step': 918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:08.880719', 'step': 918, 'epoch': 2} {'type': 'loss', 'content': 0.018315879628062248, 'timestamp': '2025-10-01 04:12:08.891227', 'step': 919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:08.960991', 'step': 919, 'epoch': 2} {'type': 'loss', 'content': 0.0561673678457737, 'timestamp': '2025-10-01 04:12:08.993133', 'step': 920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.059612', 'step': 920, 'epoch': 2} {'type': 'loss', 'content': 0.015595680102705956, 'timestamp': '2025-10-01 04:12:09.074119', 'step': 921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.136361', 'step': 921, 'epoch': 2} {'type': 'loss', 'content': 0.0501207634806633, 'timestamp': '2025-10-01 04:12:09.149929', 'step': 922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.217652', 'step': 922, 'epoch': 2} {'type': 'loss', 'content': 0.02210090495646, 'timestamp': '2025-10-01 04:12:09.221052', 'step': 923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:09.287013', 'step': 923, 'epoch': 2} {'type': 'loss', 'content': 0.04293939843773842, 'timestamp': '2025-10-01 04:12:09.318752', 'step': 924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.365635', 'step': 924, 'epoch': 2} {'type': 'loss', 'content': 0.0104265958070755, 'timestamp': '2025-10-01 04:12:09.368892', 'step': 925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.408790', 'step': 925, 'epoch': 2} {'type': 'loss', 'content': 0.00624179607257247, 'timestamp': '2025-10-01 04:12:09.415409', 'step': 926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.465406', 'step': 926, 'epoch': 2} {'type': 'loss', 'content': 0.02589820884168148, 'timestamp': '2025-10-01 04:12:09.472234', 'step': 927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.513580', 'step': 927, 'epoch': 2} {'type': 'loss', 'content': 0.020178841426968575, 'timestamp': '2025-10-01 04:12:09.540253', 'step': 928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.579214', 'step': 928, 'epoch': 2} {'type': 'loss', 'content': 0.01020850706845522, 'timestamp': '2025-10-01 04:12:09.585758', 'step': 929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.624660', 'step': 929, 'epoch': 2} {'type': 'loss', 'content': 0.015341407619416714, 'timestamp': '2025-10-01 04:12:09.629545', 'step': 930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.667253', 'step': 930, 'epoch': 2} {'type': 'loss', 'content': 0.021973326802253723, 'timestamp': '2025-10-01 04:12:09.671565', 'step': 931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.712506', 'step': 931, 'epoch': 2} {'type': 'loss', 'content': 0.025720873847603798, 'timestamp': '2025-10-01 04:12:09.740189', 'step': 932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.778019', 'step': 932, 'epoch': 2} {'type': 'loss', 'content': 0.018589098006486893, 'timestamp': '2025-10-01 04:12:09.785840', 'step': 933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.825380', 'step': 933, 'epoch': 2} {'type': 'loss', 'content': 0.025446845218539238, 'timestamp': '2025-10-01 04:12:09.830555', 'step': 934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.878671', 'step': 934, 'epoch': 2} {'type': 'loss', 'content': 0.014553597196936607, 'timestamp': '2025-10-01 04:12:09.882659', 'step': 935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:09.930777', 'step': 935, 'epoch': 2} {'type': 'loss', 'content': 0.021716592833399773, 'timestamp': '2025-10-01 04:12:09.955352', 'step': 936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:09.999942', 'step': 936, 'epoch': 2} {'type': 'loss', 'content': 0.01763700507581234, 'timestamp': '2025-10-01 04:12:10.005921', 'step': 937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:10.063041', 'step': 937, 'epoch': 2} {'type': 'loss', 'content': 0.020768333226442337, 'timestamp': '2025-10-01 04:12:10.069442', 'step': 938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.110045', 'step': 938, 'epoch': 2} {'type': 'loss', 'content': 0.02879549190402031, 'timestamp': '2025-10-01 04:12:10.115375', 'step': 939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.163679', 'step': 939, 'epoch': 2} {'type': 'loss', 'content': 0.018444154411554337, 'timestamp': '2025-10-01 04:12:10.191711', 'step': 940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.229687', 'step': 940, 'epoch': 2} {'type': 'loss', 'content': 0.024395182728767395, 'timestamp': '2025-10-01 04:12:10.233015', 'step': 941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.269980', 'step': 941, 'epoch': 2} {'type': 'loss', 'content': 0.01769259199500084, 'timestamp': '2025-10-01 04:12:10.276898', 'step': 942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:10.321030', 'step': 942, 'epoch': 2} {'type': 'loss', 'content': 0.01453376840800047, 'timestamp': '2025-10-01 04:12:10.325942', 'step': 943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:10.367664', 'step': 943, 'epoch': 2} {'type': 'loss', 'content': 0.015906628221273422, 'timestamp': '2025-10-01 04:12:10.396363', 'step': 944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.442167', 'step': 944, 'epoch': 2} {'type': 'loss', 'content': 0.02611907199025154, 'timestamp': '2025-10-01 04:12:10.447021', 'step': 945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.487681', 'step': 945, 'epoch': 2} {'type': 'loss', 'content': 0.019764339551329613, 'timestamp': '2025-10-01 04:12:10.497805', 'step': 946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.545114', 'step': 946, 'epoch': 2} {'type': 'loss', 'content': 0.019637709483504295, 'timestamp': '2025-10-01 04:12:10.554892', 'step': 947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.607893', 'step': 947, 'epoch': 2} {'type': 'loss', 'content': 0.02475491352379322, 'timestamp': '2025-10-01 04:12:10.634591', 'step': 948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.678263', 'step': 948, 'epoch': 2} {'type': 'loss', 'content': 0.021814698353409767, 'timestamp': '2025-10-01 04:12:10.684638', 'step': 949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.723482', 'step': 949, 'epoch': 2} {'type': 'loss', 'content': 0.013068260625004768, 'timestamp': '2025-10-01 04:12:10.727501', 'step': 950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.769434', 'step': 950, 'epoch': 2} {'type': 'loss', 'content': 0.028850361704826355, 'timestamp': '2025-10-01 04:12:10.775430', 'step': 951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.813547', 'step': 951, 'epoch': 2} {'type': 'loss', 'content': 0.012638353742659092, 'timestamp': '2025-10-01 04:12:10.838855', 'step': 952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.873276', 'step': 952, 'epoch': 2} {'type': 'loss', 'content': 0.018763437867164612, 'timestamp': '2025-10-01 04:12:10.878425', 'step': 953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.920391', 'step': 953, 'epoch': 2} {'type': 'loss', 'content': 0.01879069395363331, 'timestamp': '2025-10-01 04:12:10.925301', 'step': 954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:10.963130', 'step': 954, 'epoch': 2} {'type': 'loss', 'content': 0.015433231368660927, 'timestamp': '2025-10-01 04:12:10.966266', 'step': 955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:11.001795', 'step': 955, 'epoch': 2} {'type': 'loss', 'content': 0.010517173446714878, 'timestamp': '2025-10-01 04:12:11.026780', 'step': 956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.061905', 'step': 956, 'epoch': 2} {'type': 'loss', 'content': 0.01133835967630148, 'timestamp': '2025-10-01 04:12:11.066384', 'step': 957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.101600', 'step': 957, 'epoch': 2} {'type': 'loss', 'content': 0.006800402887165546, 'timestamp': '2025-10-01 04:12:11.108591', 'step': 958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:11.147042', 'step': 958, 'epoch': 2} {'type': 'loss', 'content': 0.01833231933414936, 'timestamp': '2025-10-01 04:12:11.150072', 'step': 959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.183028', 'step': 959, 'epoch': 2} {'type': 'loss', 'content': 0.019620051607489586, 'timestamp': '2025-10-01 04:12:11.207303', 'step': 960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:11.241080', 'step': 960, 'epoch': 2} {'type': 'loss', 'content': 0.0022921969648450613, 'timestamp': '2025-10-01 04:12:11.243564', 'step': 961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.275719', 'step': 961, 'epoch': 2} {'type': 'loss', 'content': 0.0034712564665824175, 'timestamp': '2025-10-01 04:12:11.280361', 'step': 962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.314318', 'step': 962, 'epoch': 2} {'type': 'loss', 'content': 0.003425298258662224, 'timestamp': '2025-10-01 04:12:11.319283', 'step': 963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.359994', 'step': 963, 'epoch': 2} {'type': 'loss', 'content': 0.021860791370272636, 'timestamp': '2025-10-01 04:12:11.386160', 'step': 964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.418662', 'step': 964, 'epoch': 2} {'type': 'loss', 'content': 0.009410454891622066, 'timestamp': '2025-10-01 04:12:11.421491', 'step': 965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.455087', 'step': 965, 'epoch': 2} {'type': 'loss', 'content': 0.0017230076482519507, 'timestamp': '2025-10-01 04:12:11.465445', 'step': 966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:11.508021', 'step': 966, 'epoch': 2} {'type': 'loss', 'content': 0.030510956421494484, 'timestamp': '2025-10-01 04:12:11.516776', 'step': 967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.579721', 'step': 967, 'epoch': 2} {'type': 'loss', 'content': 0.0029920325614511967, 'timestamp': '2025-10-01 04:12:11.611391', 'step': 968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:11.666162', 'step': 968, 'epoch': 2} {'type': 'loss', 'content': 0.04396705701947212, 'timestamp': '2025-10-01 04:12:11.676197', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:13.097572', 'step': 969, 'epoch': 2} {'type': 'pplx', 'content': 81072435.35475479, 'timestamp': '2025-10-01 04:12:13.100930', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.134354', 'step': 969, 'epoch': 2} {'type': 'loss', 'content': 0.03889823704957962, 'timestamp': '2025-10-01 04:12:13.136423', 'step': 970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:13.169704', 'step': 970, 'epoch': 2} {'type': 'loss', 'content': 0.03967985138297081, 'timestamp': '2025-10-01 04:12:13.173045', 'step': 971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:13.206990', 'step': 971, 'epoch': 2} {'type': 'loss', 'content': 0.019627872854471207, 'timestamp': '2025-10-01 04:12:13.230581', 'step': 972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:13.267976', 'step': 972, 'epoch': 2} {'type': 'loss', 'content': 0.03085772506892681, 'timestamp': '2025-10-01 04:12:13.271186', 'step': 973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.304172', 'step': 973, 'epoch': 2} {'type': 'loss', 'content': 0.0168258436024189, 'timestamp': '2025-10-01 04:12:13.310505', 'step': 974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.348723', 'step': 974, 'epoch': 2} {'type': 'loss', 'content': 0.018279071897268295, 'timestamp': '2025-10-01 04:12:13.354676', 'step': 975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.396816', 'step': 975, 'epoch': 2} {'type': 'loss', 'content': 0.024387961253523827, 'timestamp': '2025-10-01 04:12:13.425656', 'step': 976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.464578', 'step': 976, 'epoch': 2} {'type': 'loss', 'content': 0.029510188847780228, 'timestamp': '2025-10-01 04:12:13.469101', 'step': 977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.504809', 'step': 977, 'epoch': 2} {'type': 'loss', 'content': 0.011254996992647648, 'timestamp': '2025-10-01 04:12:13.512358', 'step': 978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:13.556342', 'step': 978, 'epoch': 2} {'type': 'loss', 'content': 0.02034107968211174, 'timestamp': '2025-10-01 04:12:13.562588', 'step': 979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:13.607539', 'step': 979, 'epoch': 2} {'type': 'loss', 'content': 0.0030284065287560225, 'timestamp': '2025-10-01 04:12:13.636953', 'step': 980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.676325', 'step': 980, 'epoch': 2} {'type': 'loss', 'content': 0.004054553806781769, 'timestamp': '2025-10-01 04:12:13.681705', 'step': 981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.718878', 'step': 981, 'epoch': 2} {'type': 'loss', 'content': 0.0036088896449655294, 'timestamp': '2025-10-01 04:12:13.723673', 'step': 982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:13.763124', 'step': 982, 'epoch': 2} {'type': 'loss', 'content': 0.03551328182220459, 'timestamp': '2025-10-01 04:12:13.771368', 'step': 983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.815265', 'step': 983, 'epoch': 2} {'type': 'loss', 'content': 0.004522853996604681, 'timestamp': '2025-10-01 04:12:13.841402', 'step': 984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.887015', 'step': 984, 'epoch': 2} {'type': 'loss', 'content': 0.022567346692085266, 'timestamp': '2025-10-01 04:12:13.895150', 'step': 985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:13.936009', 'step': 985, 'epoch': 2} {'type': 'loss', 'content': 0.026564184576272964, 'timestamp': '2025-10-01 04:12:13.939063', 'step': 986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:13.981348', 'step': 986, 'epoch': 2} {'type': 'loss', 'content': 0.02584756910800934, 'timestamp': '2025-10-01 04:12:13.987136', 'step': 987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.033434', 'step': 987, 'epoch': 2} {'type': 'loss', 'content': 0.003964011557400227, 'timestamp': '2025-10-01 04:12:14.057717', 'step': 988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.104433', 'step': 988, 'epoch': 2} {'type': 'loss', 'content': 0.03352031856775284, 'timestamp': '2025-10-01 04:12:14.110344', 'step': 989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:14.153205', 'step': 989, 'epoch': 2} {'type': 'loss', 'content': 0.0071314736269414425, 'timestamp': '2025-10-01 04:12:14.156565', 'step': 990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.189427', 'step': 990, 'epoch': 2} {'type': 'loss', 'content': 0.03919211030006409, 'timestamp': '2025-10-01 04:12:14.193085', 'step': 991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.226823', 'step': 991, 'epoch': 2} {'type': 'loss', 'content': 0.018441015854477882, 'timestamp': '2025-10-01 04:12:14.252171', 'step': 992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:14.286405', 'step': 992, 'epoch': 2} {'type': 'loss', 'content': 0.02514655515551567, 'timestamp': '2025-10-01 04:12:14.290860', 'step': 993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.323829', 'step': 993, 'epoch': 2} {'type': 'loss', 'content': 0.022655008360743523, 'timestamp': '2025-10-01 04:12:14.326870', 'step': 994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.359838', 'step': 994, 'epoch': 2} {'type': 'loss', 'content': 0.018373850733041763, 'timestamp': '2025-10-01 04:12:14.362778', 'step': 995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.396282', 'step': 995, 'epoch': 2} {'type': 'loss', 'content': 0.02451673150062561, 'timestamp': '2025-10-01 04:12:14.421074', 'step': 996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:14.464882', 'step': 996, 'epoch': 2} {'type': 'loss', 'content': 0.018129874020814896, 'timestamp': '2025-10-01 04:12:14.472481', 'step': 997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.517635', 'step': 997, 'epoch': 2} {'type': 'loss', 'content': 0.026043349876999855, 'timestamp': '2025-10-01 04:12:14.527495', 'step': 998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.571039', 'step': 998, 'epoch': 2} {'type': 'loss', 'content': 0.025865105912089348, 'timestamp': '2025-10-01 04:12:14.578343', 'step': 999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:14.625556', 'step': 999, 'epoch': 2} {'type': 'loss', 'content': 0.006189211271703243, 'timestamp': '2025-10-01 04:12:14.650728', 'step': 1000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-10-01 04:12:20.080288', 'step': 1000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.130469', 'step': 1000, 'epoch': 2} {'type': 'loss', 'content': 0.025496182963252068, 'timestamp': '2025-10-01 04:12:20.139070', 'step': 1001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.186396', 'step': 1001, 'epoch': 2} {'type': 'loss', 'content': 0.019999513402581215, 'timestamp': '2025-10-01 04:12:20.192583', 'step': 1002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.235175', 'step': 1002, 'epoch': 2} {'type': 'loss', 'content': 0.02922123670578003, 'timestamp': '2025-10-01 04:12:20.242913', 'step': 1003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.290618', 'step': 1003, 'epoch': 2} {'type': 'loss', 'content': 0.011163772083818913, 'timestamp': '2025-10-01 04:12:20.320274', 'step': 1004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.366077', 'step': 1004, 'epoch': 2} {'type': 'loss', 'content': 0.019625648856163025, 'timestamp': '2025-10-01 04:12:20.373317', 'step': 1005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.426353', 'step': 1005, 'epoch': 2} {'type': 'loss', 'content': 0.026116380468010902, 'timestamp': '2025-10-01 04:12:20.438592', 'step': 1006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:20.490040', 'step': 1006, 'epoch': 2} {'type': 'loss', 'content': 0.022429784759879112, 'timestamp': '2025-10-01 04:12:20.503098', 'step': 1007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.550783', 'step': 1007, 'epoch': 2} {'type': 'loss', 'content': 0.03787512332201004, 'timestamp': '2025-10-01 04:12:20.575550', 'step': 1008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:20.619300', 'step': 1008, 'epoch': 2} {'type': 'loss', 'content': 0.02521323598921299, 'timestamp': '2025-10-01 04:12:20.625053', 'step': 1009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.668753', 'step': 1009, 'epoch': 2} {'type': 'loss', 'content': 0.01483877096325159, 'timestamp': '2025-10-01 04:12:20.677295', 'step': 1010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.718640', 'step': 1010, 'epoch': 2} {'type': 'loss', 'content': 0.010802079923450947, 'timestamp': '2025-10-01 04:12:20.730255', 'step': 1011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.781829', 'step': 1011, 'epoch': 2} {'type': 'loss', 'content': 0.02756008878350258, 'timestamp': '2025-10-01 04:12:20.811183', 'step': 1012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.859332', 'step': 1012, 'epoch': 2} {'type': 'loss', 'content': 0.04122751206159592, 'timestamp': '2025-10-01 04:12:20.872435', 'step': 1013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.925042', 'step': 1013, 'epoch': 2} {'type': 'loss', 'content': 0.021239006891846657, 'timestamp': '2025-10-01 04:12:20.937739', 'step': 1014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:20.989925', 'step': 1014, 'epoch': 2} {'type': 'loss', 'content': 0.01687648706138134, 'timestamp': '2025-10-01 04:12:21.001251', 'step': 1015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.062187', 'step': 1015, 'epoch': 2} {'type': 'loss', 'content': 0.00788965355604887, 'timestamp': '2025-10-01 04:12:21.094134', 'step': 1016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.148712', 'step': 1016, 'epoch': 2} {'type': 'loss', 'content': 0.02206888608634472, 'timestamp': '2025-10-01 04:12:21.152178', 'step': 1017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.201815', 'step': 1017, 'epoch': 2} {'type': 'loss', 'content': 0.021229689940810204, 'timestamp': '2025-10-01 04:12:21.214308', 'step': 1018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.265543', 'step': 1018, 'epoch': 2} {'type': 'loss', 'content': 0.011558206751942635, 'timestamp': '2025-10-01 04:12:21.274292', 'step': 1019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.323819', 'step': 1019, 'epoch': 2} {'type': 'loss', 'content': 0.019588660448789597, 'timestamp': '2025-10-01 04:12:21.360075', 'step': 1020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.410793', 'step': 1020, 'epoch': 2} {'type': 'loss', 'content': 0.028030814602971077, 'timestamp': '2025-10-01 04:12:21.424067', 'step': 1021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.498520', 'step': 1021, 'epoch': 2} {'type': 'loss', 'content': 0.03801051899790764, 'timestamp': '2025-10-01 04:12:21.507802', 'step': 1022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.560572', 'step': 1022, 'epoch': 2} {'type': 'loss', 'content': 0.002824042458087206, 'timestamp': '2025-10-01 04:12:21.568695', 'step': 1023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:21.621389', 'step': 1023, 'epoch': 2} {'type': 'loss', 'content': 0.011485311202704906, 'timestamp': '2025-10-01 04:12:21.653646', 'step': 1024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:21.702889', 'step': 1024, 'epoch': 2} {'type': 'loss', 'content': 0.04113561287522316, 'timestamp': '2025-10-01 04:12:21.710504', 'step': 1025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:21.761086', 'step': 1025, 'epoch': 2} {'type': 'loss', 'content': 0.05204043537378311, 'timestamp': '2025-10-01 04:12:21.770999', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:23.230234', 'step': 1026, 'epoch': 2} {'type': 'pplx', 'content': 74534656.95469436, 'timestamp': '2025-10-01 04:12:23.241058', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:23.282490', 'step': 1026, 'epoch': 2} {'type': 'loss', 'content': 0.060518063604831696, 'timestamp': '2025-10-01 04:12:23.286656', 'step': 1027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:23.319127', 'step': 1027, 'epoch': 2} {'type': 'loss', 'content': 0.003438900923356414, 'timestamp': '2025-10-01 04:12:23.357621', 'step': 1028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:23.402497', 'step': 1028, 'epoch': 2} {'type': 'loss', 'content': 0.016376933082938194, 'timestamp': '2025-10-01 04:12:23.414993', 'step': 1029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:23.463369', 'step': 1029, 'epoch': 2} {'type': 'loss', 'content': 0.01734158955514431, 'timestamp': '2025-10-01 04:12:23.477727', 'step': 1030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:23.535829', 'step': 1030, 'epoch': 2} {'type': 'loss', 'content': 0.013855542056262493, 'timestamp': '2025-10-01 04:12:23.547595', 'step': 1031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:23.607960', 'step': 1031, 'epoch': 2} {'type': 'loss', 'content': 0.003171414602547884, 'timestamp': '2025-10-01 04:12:23.641584', 'step': 1032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:23.703210', 'step': 1032, 'epoch': 2} {'type': 'loss', 'content': 0.01719476841390133, 'timestamp': '2025-10-01 04:12:23.714948', 'step': 1033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:23.768955', 'step': 1033, 'epoch': 2} {'type': 'loss', 'content': 0.017137866467237473, 'timestamp': '2025-10-01 04:12:23.772655', 'step': 1034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:23.811091', 'step': 1034, 'epoch': 2} {'type': 'loss', 'content': 0.01853066310286522, 'timestamp': '2025-10-01 04:12:23.821087', 'step': 1035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:23.872087', 'step': 1035, 'epoch': 2} {'type': 'loss', 'content': 0.005616360809653997, 'timestamp': '2025-10-01 04:12:23.903414', 'step': 1036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:23.955855', 'step': 1036, 'epoch': 2} {'type': 'loss', 'content': 0.008777479641139507, 'timestamp': '2025-10-01 04:12:23.959796', 'step': 1037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.008614', 'step': 1037, 'epoch': 2} {'type': 'loss', 'content': 0.0419398732483387, 'timestamp': '2025-10-01 04:12:24.018868', 'step': 1038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:24.068646', 'step': 1038, 'epoch': 2} {'type': 'loss', 'content': 0.01611681841313839, 'timestamp': '2025-10-01 04:12:24.076779', 'step': 1039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:24.129063', 'step': 1039, 'epoch': 2} {'type': 'loss', 'content': 0.027612056583166122, 'timestamp': '2025-10-01 04:12:24.158487', 'step': 1040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.201230', 'step': 1040, 'epoch': 2} {'type': 'loss', 'content': 0.014357500709593296, 'timestamp': '2025-10-01 04:12:24.207431', 'step': 1041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.258529', 'step': 1041, 'epoch': 2} {'type': 'loss', 'content': 0.009304146282374859, 'timestamp': '2025-10-01 04:12:24.267547', 'step': 1042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.312047', 'step': 1042, 'epoch': 2} {'type': 'loss', 'content': 0.03618435934185982, 'timestamp': '2025-10-01 04:12:24.316876', 'step': 1043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:24.363088', 'step': 1043, 'epoch': 2} {'type': 'loss', 'content': 0.012262886390089989, 'timestamp': '2025-10-01 04:12:24.387460', 'step': 1044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:24.435236', 'step': 1044, 'epoch': 2} {'type': 'loss', 'content': 0.007298172917217016, 'timestamp': '2025-10-01 04:12:24.443779', 'step': 1045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.502207', 'step': 1045, 'epoch': 2} {'type': 'loss', 'content': 0.012754367664456367, 'timestamp': '2025-10-01 04:12:24.508197', 'step': 1046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.556287', 'step': 1046, 'epoch': 2} {'type': 'loss', 'content': 0.03507836535573006, 'timestamp': '2025-10-01 04:12:24.566906', 'step': 1047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.624464', 'step': 1047, 'epoch': 2} {'type': 'loss', 'content': 0.012025631964206696, 'timestamp': '2025-10-01 04:12:24.655464', 'step': 1048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:24.710791', 'step': 1048, 'epoch': 2} {'type': 'loss', 'content': 0.009308640845119953, 'timestamp': '2025-10-01 04:12:24.720986', 'step': 1049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:24.767803', 'step': 1049, 'epoch': 2} {'type': 'loss', 'content': 0.014288820326328278, 'timestamp': '2025-10-01 04:12:24.775169', 'step': 1050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.821807', 'step': 1050, 'epoch': 2} {'type': 'loss', 'content': 0.02874881587922573, 'timestamp': '2025-10-01 04:12:24.829322', 'step': 1051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:24.879997', 'step': 1051, 'epoch': 2} {'type': 'loss', 'content': 0.011433676816523075, 'timestamp': '2025-10-01 04:12:24.908934', 'step': 1052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:24.957889', 'step': 1052, 'epoch': 2} {'type': 'loss', 'content': 0.01906905137002468, 'timestamp': '2025-10-01 04:12:24.961964', 'step': 1053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:25.013911', 'step': 1053, 'epoch': 2} {'type': 'loss', 'content': 0.03443703427910805, 'timestamp': '2025-10-01 04:12:25.022468', 'step': 1054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:25.065818', 'step': 1054, 'epoch': 2} {'type': 'loss', 'content': 0.02929351106286049, 'timestamp': '2025-10-01 04:12:25.075300', 'step': 1055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.127217', 'step': 1055, 'epoch': 2} {'type': 'loss', 'content': 0.03993804007768631, 'timestamp': '2025-10-01 04:12:25.156783', 'step': 1056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:25.208876', 'step': 1056, 'epoch': 2} {'type': 'loss', 'content': 0.021182114258408546, 'timestamp': '2025-10-01 04:12:25.216179', 'step': 1057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:25.262382', 'step': 1057, 'epoch': 2} {'type': 'loss', 'content': 0.024790504947304726, 'timestamp': '2025-10-01 04:12:25.265465', 'step': 1058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.305899', 'step': 1058, 'epoch': 2} {'type': 'loss', 'content': 0.03713010996580124, 'timestamp': '2025-10-01 04:12:25.313698', 'step': 1059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.360189', 'step': 1059, 'epoch': 2} {'type': 'loss', 'content': 0.01229268778115511, 'timestamp': '2025-10-01 04:12:25.389744', 'step': 1060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:25.436339', 'step': 1060, 'epoch': 2} {'type': 'loss', 'content': 0.013694092631340027, 'timestamp': '2025-10-01 04:12:25.449497', 'step': 1061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:25.506737', 'step': 1061, 'epoch': 2} {'type': 'loss', 'content': 0.01466763112694025, 'timestamp': '2025-10-01 04:12:25.514043', 'step': 1062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:25.561668', 'step': 1062, 'epoch': 2} {'type': 'loss', 'content': 0.006644328590482473, 'timestamp': '2025-10-01 04:12:25.564531', 'step': 1063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.614569', 'step': 1063, 'epoch': 2} {'type': 'loss', 'content': 0.009213857352733612, 'timestamp': '2025-10-01 04:12:25.648593', 'step': 1064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.706214', 'step': 1064, 'epoch': 2} {'type': 'loss', 'content': 0.03488415107131004, 'timestamp': '2025-10-01 04:12:25.717488', 'step': 1065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:25.782296', 'step': 1065, 'epoch': 2} {'type': 'loss', 'content': 0.02670764923095703, 'timestamp': '2025-10-01 04:12:25.794108', 'step': 1066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.856101', 'step': 1066, 'epoch': 2} {'type': 'loss', 'content': 0.026423903182148933, 'timestamp': '2025-10-01 04:12:25.859779', 'step': 1067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.902843', 'step': 1067, 'epoch': 2} {'type': 'loss', 'content': 0.012425941415131092, 'timestamp': '2025-10-01 04:12:25.938403', 'step': 1068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:25.997755', 'step': 1068, 'epoch': 2} {'type': 'loss', 'content': 0.004534396808594465, 'timestamp': '2025-10-01 04:12:26.010122', 'step': 1069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:26.070158', 'step': 1069, 'epoch': 2} {'type': 'loss', 'content': 0.012708894908428192, 'timestamp': '2025-10-01 04:12:26.083392', 'step': 1070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:26.143118', 'step': 1070, 'epoch': 2} {'type': 'loss', 'content': 0.020166311413049698, 'timestamp': '2025-10-01 04:12:26.154906', 'step': 1071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.213791', 'step': 1071, 'epoch': 2} {'type': 'loss', 'content': 0.026705464348196983, 'timestamp': '2025-10-01 04:12:26.246010', 'step': 1072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.306666', 'step': 1072, 'epoch': 2} {'type': 'loss', 'content': 0.01093844510614872, 'timestamp': '2025-10-01 04:12:26.314580', 'step': 1073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.365533', 'step': 1073, 'epoch': 2} {'type': 'loss', 'content': 0.04105237126350403, 'timestamp': '2025-10-01 04:12:26.374837', 'step': 1074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.422483', 'step': 1074, 'epoch': 2} {'type': 'loss', 'content': 0.05717204138636589, 'timestamp': '2025-10-01 04:12:26.430254', 'step': 1075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:26.478912', 'step': 1075, 'epoch': 2} {'type': 'loss', 'content': 0.04415814206004143, 'timestamp': '2025-10-01 04:12:26.509652', 'step': 1076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.558489', 'step': 1076, 'epoch': 2} {'type': 'loss', 'content': 0.0289046261459589, 'timestamp': '2025-10-01 04:12:26.568501', 'step': 1077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.619976', 'step': 1077, 'epoch': 2} {'type': 'loss', 'content': 0.022754548117518425, 'timestamp': '2025-10-01 04:12:26.629759', 'step': 1078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.675199', 'step': 1078, 'epoch': 2} {'type': 'loss', 'content': 0.018774649128317833, 'timestamp': '2025-10-01 04:12:26.681482', 'step': 1079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.731949', 'step': 1079, 'epoch': 2} {'type': 'loss', 'content': 0.01819383166730404, 'timestamp': '2025-10-01 04:12:26.759080', 'step': 1080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:26.814991', 'step': 1080, 'epoch': 2} {'type': 'loss', 'content': 0.020636798813939095, 'timestamp': '2025-10-01 04:12:26.825870', 'step': 1081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:26.880369', 'step': 1081, 'epoch': 2} {'type': 'loss', 'content': 0.02792542800307274, 'timestamp': '2025-10-01 04:12:26.890526', 'step': 1082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:26.949859', 'step': 1082, 'epoch': 2} {'type': 'loss', 'content': 0.01891268976032734, 'timestamp': '2025-10-01 04:12:26.954700', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:28.175094', 'step': 1083, 'epoch': 2} {'type': 'pplx', 'content': 62566369.90442915, 'timestamp': '2025-10-01 04:12:28.182108', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:28.219753', 'step': 1083, 'epoch': 2} {'type': 'loss', 'content': 0.013113982044160366, 'timestamp': '2025-10-01 04:12:28.246671', 'step': 1084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:28.300031', 'step': 1084, 'epoch': 2} {'type': 'loss', 'content': 0.006288694683462381, 'timestamp': '2025-10-01 04:12:28.306128', 'step': 1085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:28.350994', 'step': 1085, 'epoch': 2} {'type': 'loss', 'content': 0.020121892914175987, 'timestamp': '2025-10-01 04:12:28.357824', 'step': 1086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:28.398651', 'step': 1086, 'epoch': 2} {'type': 'loss', 'content': 0.027080422267317772, 'timestamp': '2025-10-01 04:12:28.404013', 'step': 1087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:28.453476', 'step': 1087, 'epoch': 2} {'type': 'loss', 'content': 0.009238681755959988, 'timestamp': '2025-10-01 04:12:28.480708', 'step': 1088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:28.525283', 'step': 1088, 'epoch': 2} {'type': 'loss', 'content': 0.033649567514657974, 'timestamp': '2025-10-01 04:12:28.532996', 'step': 1089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:28.576999', 'step': 1089, 'epoch': 2} {'type': 'loss', 'content': 0.021768080070614815, 'timestamp': '2025-10-01 04:12:28.585337', 'step': 1090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:28.628749', 'step': 1090, 'epoch': 2} {'type': 'loss', 'content': 0.04176913574337959, 'timestamp': '2025-10-01 04:12:28.633636', 'step': 1091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:28.678793', 'step': 1091, 'epoch': 2} {'type': 'loss', 'content': 0.007122901733964682, 'timestamp': '2025-10-01 04:12:28.706112', 'step': 1092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:28.742425', 'step': 1092, 'epoch': 2} {'type': 'loss', 'content': 0.017248356714844704, 'timestamp': '2025-10-01 04:12:28.746780', 'step': 1093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:28.793813', 'step': 1093, 'epoch': 2} {'type': 'loss', 'content': 0.011409069411456585, 'timestamp': '2025-10-01 04:12:28.800770', 'step': 1094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:28.844952', 'step': 1094, 'epoch': 2} {'type': 'loss', 'content': 0.010414226911962032, 'timestamp': '2025-10-01 04:12:28.848936', 'step': 1095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:28.890495', 'step': 1095, 'epoch': 2} {'type': 'loss', 'content': 0.005965986289083958, 'timestamp': '2025-10-01 04:12:28.916790', 'step': 1096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:28.961079', 'step': 1096, 'epoch': 2} {'type': 'loss', 'content': 0.019725624471902847, 'timestamp': '2025-10-01 04:12:28.965573', 'step': 1097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.009299', 'step': 1097, 'epoch': 2} {'type': 'loss', 'content': 0.019195079803466797, 'timestamp': '2025-10-01 04:12:29.017144', 'step': 1098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.062348', 'step': 1098, 'epoch': 2} {'type': 'loss', 'content': 0.020952671766281128, 'timestamp': '2025-10-01 04:12:29.068559', 'step': 1099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.110221', 'step': 1099, 'epoch': 2} {'type': 'loss', 'content': 0.027296748012304306, 'timestamp': '2025-10-01 04:12:29.137134', 'step': 1100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:29.179356', 'step': 1100, 'epoch': 2} {'type': 'loss', 'content': 0.02847483567893505, 'timestamp': '2025-10-01 04:12:29.183252', 'step': 1101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:29.227678', 'step': 1101, 'epoch': 2} {'type': 'loss', 'content': 0.01081875804811716, 'timestamp': '2025-10-01 04:12:29.242281', 'step': 1102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.318405', 'step': 1102, 'epoch': 2} {'type': 'loss', 'content': 0.021221021190285683, 'timestamp': '2025-10-01 04:12:29.329630', 'step': 1103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.389413', 'step': 1103, 'epoch': 2} {'type': 'loss', 'content': 0.018782924860715866, 'timestamp': '2025-10-01 04:12:29.420176', 'step': 1104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.466310', 'step': 1104, 'epoch': 2} {'type': 'loss', 'content': 0.012332765385508537, 'timestamp': '2025-10-01 04:12:29.476222', 'step': 1105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:29.531474', 'step': 1105, 'epoch': 2} {'type': 'loss', 'content': 0.011492065154016018, 'timestamp': '2025-10-01 04:12:29.540729', 'step': 1106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.585476', 'step': 1106, 'epoch': 2} {'type': 'loss', 'content': 0.025456393137574196, 'timestamp': '2025-10-01 04:12:29.590994', 'step': 1107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.634583', 'step': 1107, 'epoch': 2} {'type': 'loss', 'content': 0.021532809361815453, 'timestamp': '2025-10-01 04:12:29.663363', 'step': 1108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:29.713162', 'step': 1108, 'epoch': 2} {'type': 'loss', 'content': 0.0116585036739707, 'timestamp': '2025-10-01 04:12:29.721922', 'step': 1109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:29.769213', 'step': 1109, 'epoch': 2} {'type': 'loss', 'content': 0.019244659692049026, 'timestamp': '2025-10-01 04:12:29.772276', 'step': 1110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:29.819181', 'step': 1110, 'epoch': 2} {'type': 'loss', 'content': 0.019055215641856194, 'timestamp': '2025-10-01 04:12:29.825545', 'step': 1111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:29.870125', 'step': 1111, 'epoch': 2} {'type': 'loss', 'content': 0.028476441279053688, 'timestamp': '2025-10-01 04:12:29.893903', 'step': 1112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:29.945679', 'step': 1112, 'epoch': 2} {'type': 'loss', 'content': 0.022367192432284355, 'timestamp': '2025-10-01 04:12:29.953438', 'step': 1113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:30.005605', 'step': 1113, 'epoch': 2} {'type': 'loss', 'content': 0.012778395786881447, 'timestamp': '2025-10-01 04:12:30.014956', 'step': 1114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:30.062455', 'step': 1114, 'epoch': 2} {'type': 'loss', 'content': 0.02362431026995182, 'timestamp': '2025-10-01 04:12:30.069487', 'step': 1115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:30.112843', 'step': 1115, 'epoch': 2} {'type': 'loss', 'content': 0.0172555074095726, 'timestamp': '2025-10-01 04:12:30.147296', 'step': 1116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.184903', 'step': 1116, 'epoch': 2} {'type': 'loss', 'content': 0.026192547753453255, 'timestamp': '2025-10-01 04:12:30.192051', 'step': 1117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:30.237473', 'step': 1117, 'epoch': 2} {'type': 'loss', 'content': 0.014290613122284412, 'timestamp': '2025-10-01 04:12:30.247066', 'step': 1118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.294475', 'step': 1118, 'epoch': 2} {'type': 'loss', 'content': 0.021653365343809128, 'timestamp': '2025-10-01 04:12:30.303475', 'step': 1119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.351658', 'step': 1119, 'epoch': 2} {'type': 'loss', 'content': 0.030385613441467285, 'timestamp': '2025-10-01 04:12:30.375922', 'step': 1120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:30.423559', 'step': 1120, 'epoch': 2} {'type': 'loss', 'content': 0.02170766517519951, 'timestamp': '2025-10-01 04:12:30.431657', 'step': 1121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:30.477731', 'step': 1121, 'epoch': 2} {'type': 'loss', 'content': 0.010667859576642513, 'timestamp': '2025-10-01 04:12:30.486076', 'step': 1122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.528274', 'step': 1122, 'epoch': 2} {'type': 'loss', 'content': 0.014071441255509853, 'timestamp': '2025-10-01 04:12:30.535707', 'step': 1123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.580818', 'step': 1123, 'epoch': 2} {'type': 'loss', 'content': 0.014766133390367031, 'timestamp': '2025-10-01 04:12:30.606708', 'step': 1124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.664808', 'step': 1124, 'epoch': 2} {'type': 'loss', 'content': 0.007835105992853642, 'timestamp': '2025-10-01 04:12:30.670811', 'step': 1125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.711157', 'step': 1125, 'epoch': 2} {'type': 'loss', 'content': 0.020053701475262642, 'timestamp': '2025-10-01 04:12:30.715814', 'step': 1126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.754530', 'step': 1126, 'epoch': 2} {'type': 'loss', 'content': 0.025708338245749474, 'timestamp': '2025-10-01 04:12:30.758579', 'step': 1127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.798065', 'step': 1127, 'epoch': 2} {'type': 'loss', 'content': 0.007576673291623592, 'timestamp': '2025-10-01 04:12:30.823696', 'step': 1128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.861459', 'step': 1128, 'epoch': 2} {'type': 'loss', 'content': 0.042808592319488525, 'timestamp': '2025-10-01 04:12:30.866053', 'step': 1129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.906239', 'step': 1129, 'epoch': 2} {'type': 'loss', 'content': 0.032328877598047256, 'timestamp': '2025-10-01 04:12:30.910682', 'step': 1130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.947319', 'step': 1130, 'epoch': 2} {'type': 'loss', 'content': 0.015181933529675007, 'timestamp': '2025-10-01 04:12:30.952193', 'step': 1131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:30.992184', 'step': 1131, 'epoch': 2} {'type': 'loss', 'content': 0.010932973586022854, 'timestamp': '2025-10-01 04:12:31.017964', 'step': 1132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:31.056055', 'step': 1132, 'epoch': 2} {'type': 'loss', 'content': 0.019718725234270096, 'timestamp': '2025-10-01 04:12:31.061331', 'step': 1133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:31.104264', 'step': 1133, 'epoch': 2} {'type': 'loss', 'content': 0.016067685559391975, 'timestamp': '2025-10-01 04:12:31.109066', 'step': 1134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:31.149572', 'step': 1134, 'epoch': 2} {'type': 'loss', 'content': 0.02257952094078064, 'timestamp': '2025-10-01 04:12:31.157413', 'step': 1135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:31.203158', 'step': 1135, 'epoch': 2} {'type': 'loss', 'content': 0.011983809992671013, 'timestamp': '2025-10-01 04:12:31.231501', 'step': 1136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:31.273413', 'step': 1136, 'epoch': 2} {'type': 'loss', 'content': 0.007152553182095289, 'timestamp': '2025-10-01 04:12:31.276438', 'step': 1137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:31.318357', 'step': 1137, 'epoch': 2} {'type': 'loss', 'content': 0.018866227939724922, 'timestamp': '2025-10-01 04:12:31.325994', 'step': 1138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:31.368558', 'step': 1138, 'epoch': 2} {'type': 'loss', 'content': 0.011750993318855762, 'timestamp': '2025-10-01 04:12:31.371822', 'step': 1139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:31.411580', 'step': 1139, 'epoch': 2} {'type': 'loss', 'content': 0.01951892487704754, 'timestamp': '2025-10-01 04:12:31.439193', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:32.584175', 'step': 1140, 'epoch': 2} {'type': 'pplx', 'content': 62764748.542011686, 'timestamp': '2025-10-01 04:12:32.588321', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:32.624205', 'step': 1140, 'epoch': 2} {'type': 'loss', 'content': 0.029061226174235344, 'timestamp': '2025-10-01 04:12:32.629949', 'step': 1141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:32.673699', 'step': 1141, 'epoch': 2} {'type': 'loss', 'content': 0.00953701976686716, 'timestamp': '2025-10-01 04:12:32.676602', 'step': 1142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:32.719807', 'step': 1142, 'epoch': 2} {'type': 'loss', 'content': 0.01728624477982521, 'timestamp': '2025-10-01 04:12:32.727261', 'step': 1143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:32.772725', 'step': 1143, 'epoch': 2} {'type': 'loss', 'content': 0.033103007823228836, 'timestamp': '2025-10-01 04:12:32.801245', 'step': 1144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:32.844340', 'step': 1144, 'epoch': 2} {'type': 'loss', 'content': 0.004969587083905935, 'timestamp': '2025-10-01 04:12:32.850720', 'step': 1145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:32.893335', 'step': 1145, 'epoch': 2} {'type': 'loss', 'content': 0.020533567294478416, 'timestamp': '2025-10-01 04:12:32.899869', 'step': 1146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:32.950253', 'step': 1146, 'epoch': 2} {'type': 'loss', 'content': 0.010829508304595947, 'timestamp': '2025-10-01 04:12:32.953914', 'step': 1147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:32.995169', 'step': 1147, 'epoch': 2} {'type': 'loss', 'content': 0.014489369466900826, 'timestamp': '2025-10-01 04:12:33.025395', 'step': 1148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.073039', 'step': 1148, 'epoch': 2} {'type': 'loss', 'content': 0.02824089117348194, 'timestamp': '2025-10-01 04:12:33.077672', 'step': 1149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:33.120786', 'step': 1149, 'epoch': 2} {'type': 'loss', 'content': 0.012532351538538933, 'timestamp': '2025-10-01 04:12:33.127462', 'step': 1150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.172186', 'step': 1150, 'epoch': 2} {'type': 'loss', 'content': 0.03572159260511398, 'timestamp': '2025-10-01 04:12:33.175873', 'step': 1151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.215618', 'step': 1151, 'epoch': 2} {'type': 'loss', 'content': 0.010852976702153683, 'timestamp': '2025-10-01 04:12:33.245624', 'step': 1152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.291717', 'step': 1152, 'epoch': 2} {'type': 'loss', 'content': 0.012694260105490685, 'timestamp': '2025-10-01 04:12:33.301775', 'step': 1153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:33.335968', 'step': 1153, 'epoch': 2} {'type': 'loss', 'content': 0.011462218128144741, 'timestamp': '2025-10-01 04:12:33.339956', 'step': 1154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.371787', 'step': 1154, 'epoch': 2} {'type': 'loss', 'content': 0.02176797203719616, 'timestamp': '2025-10-01 04:12:33.374914', 'step': 1155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.406083', 'step': 1155, 'epoch': 2} {'type': 'loss', 'content': 0.023000076413154602, 'timestamp': '2025-10-01 04:12:33.431149', 'step': 1156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.462980', 'step': 1156, 'epoch': 2} {'type': 'loss', 'content': 0.013469113036990166, 'timestamp': '2025-10-01 04:12:33.465651', 'step': 1157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:33.497278', 'step': 1157, 'epoch': 2} {'type': 'loss', 'content': 0.011314094997942448, 'timestamp': '2025-10-01 04:12:33.499759', 'step': 1158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.531807', 'step': 1158, 'epoch': 2} {'type': 'loss', 'content': 0.013132537715137005, 'timestamp': '2025-10-01 04:12:33.534294', 'step': 1159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.568740', 'step': 1159, 'epoch': 2} {'type': 'loss', 'content': 0.007822925224900246, 'timestamp': '2025-10-01 04:12:33.592753', 'step': 1160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:33.624535', 'step': 1160, 'epoch': 2} {'type': 'loss', 'content': 0.007950437255203724, 'timestamp': '2025-10-01 04:12:33.627429', 'step': 1161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.659359', 'step': 1161, 'epoch': 2} {'type': 'loss', 'content': 0.023385372012853622, 'timestamp': '2025-10-01 04:12:33.663040', 'step': 1162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.696229', 'step': 1162, 'epoch': 2} {'type': 'loss', 'content': 0.01512717455625534, 'timestamp': '2025-10-01 04:12:33.698622', 'step': 1163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:33.729740', 'step': 1163, 'epoch': 2} {'type': 'loss', 'content': 0.022730054333806038, 'timestamp': '2025-10-01 04:12:33.754084', 'step': 1164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.785960', 'step': 1164, 'epoch': 2} {'type': 'loss', 'content': 0.016662919893860817, 'timestamp': '2025-10-01 04:12:33.788657', 'step': 1165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.821178', 'step': 1165, 'epoch': 2} {'type': 'loss', 'content': 0.01865740306675434, 'timestamp': '2025-10-01 04:12:33.823887', 'step': 1166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:33.855044', 'step': 1166, 'epoch': 2} {'type': 'loss', 'content': 0.010064325295388699, 'timestamp': '2025-10-01 04:12:33.857789', 'step': 1167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.889101', 'step': 1167, 'epoch': 2} {'type': 'loss', 'content': 0.03933363035321236, 'timestamp': '2025-10-01 04:12:33.913305', 'step': 1168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:33.946332', 'step': 1168, 'epoch': 2} {'type': 'loss', 'content': 0.01619604602456093, 'timestamp': '2025-10-01 04:12:33.949401', 'step': 1169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:33.981698', 'step': 1169, 'epoch': 2} {'type': 'loss', 'content': 0.008631623350083828, 'timestamp': '2025-10-01 04:12:33.984775', 'step': 1170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:34.016928', 'step': 1170, 'epoch': 2} {'type': 'loss', 'content': 0.01610485464334488, 'timestamp': '2025-10-01 04:12:34.020058', 'step': 1171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:34.054794', 'step': 1171, 'epoch': 2} {'type': 'loss', 'content': 0.008041969500482082, 'timestamp': '2025-10-01 04:12:34.078747', 'step': 1172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:34.111508', 'step': 1172, 'epoch': 2} {'type': 'loss', 'content': 0.012203284539282322, 'timestamp': '2025-10-01 04:12:34.114186', 'step': 1173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.146232', 'step': 1173, 'epoch': 2} {'type': 'loss', 'content': 0.014361986890435219, 'timestamp': '2025-10-01 04:12:34.148827', 'step': 1174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.181487', 'step': 1174, 'epoch': 2} {'type': 'loss', 'content': 0.01781057007610798, 'timestamp': '2025-10-01 04:12:34.183913', 'step': 1175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.217961', 'step': 1175, 'epoch': 2} {'type': 'loss', 'content': 0.009551103226840496, 'timestamp': '2025-10-01 04:12:34.242205', 'step': 1176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:34.274120', 'step': 1176, 'epoch': 2} {'type': 'loss', 'content': 0.029874419793486595, 'timestamp': '2025-10-01 04:12:34.278147', 'step': 1177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.309583', 'step': 1177, 'epoch': 2} {'type': 'loss', 'content': 0.015719091519713402, 'timestamp': '2025-10-01 04:12:34.312253', 'step': 1178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.343774', 'step': 1178, 'epoch': 2} {'type': 'loss', 'content': 0.027939843013882637, 'timestamp': '2025-10-01 04:12:34.346675', 'step': 1179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:34.380926', 'step': 1179, 'epoch': 2} {'type': 'loss', 'content': 0.01609652303159237, 'timestamp': '2025-10-01 04:12:34.404946', 'step': 1180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:34.442749', 'step': 1180, 'epoch': 2} {'type': 'loss', 'content': 0.030004026368260384, 'timestamp': '2025-10-01 04:12:34.445012', 'step': 1181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.480249', 'step': 1181, 'epoch': 2} {'type': 'loss', 'content': 0.03301515057682991, 'timestamp': '2025-10-01 04:12:34.483656', 'step': 1182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.524031', 'step': 1182, 'epoch': 2} {'type': 'loss', 'content': 0.01641576923429966, 'timestamp': '2025-10-01 04:12:34.526590', 'step': 1183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:34.559718', 'step': 1183, 'epoch': 2} {'type': 'loss', 'content': 0.006728612817823887, 'timestamp': '2025-10-01 04:12:34.583719', 'step': 1184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:34.614912', 'step': 1184, 'epoch': 2} {'type': 'loss', 'content': 0.021417854353785515, 'timestamp': '2025-10-01 04:12:34.618043', 'step': 1185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:34.650819', 'step': 1185, 'epoch': 2} {'type': 'loss', 'content': 0.014348623342812061, 'timestamp': '2025-10-01 04:12:34.653889', 'step': 1186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.686643', 'step': 1186, 'epoch': 2} {'type': 'loss', 'content': 0.006632161792367697, 'timestamp': '2025-10-01 04:12:34.689112', 'step': 1187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.720413', 'step': 1187, 'epoch': 2} {'type': 'loss', 'content': 0.01136785838752985, 'timestamp': '2025-10-01 04:12:34.744434', 'step': 1188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:34.776530', 'step': 1188, 'epoch': 2} {'type': 'loss', 'content': 0.011251566000282764, 'timestamp': '2025-10-01 04:12:34.779197', 'step': 1189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.811725', 'step': 1189, 'epoch': 2} {'type': 'loss', 'content': 0.030348431318998337, 'timestamp': '2025-10-01 04:12:34.814491', 'step': 1190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:34.849612', 'step': 1190, 'epoch': 2} {'type': 'loss', 'content': 0.01289790216833353, 'timestamp': '2025-10-01 04:12:34.852248', 'step': 1191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.885484', 'step': 1191, 'epoch': 2} {'type': 'loss', 'content': 0.018557637929916382, 'timestamp': '2025-10-01 04:12:34.909400', 'step': 1192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:34.940042', 'step': 1192, 'epoch': 2} {'type': 'loss', 'content': 0.017000071704387665, 'timestamp': '2025-10-01 04:12:34.942079', 'step': 1193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:34.971529', 'step': 1193, 'epoch': 2} {'type': 'loss', 'content': 0.002277073683217168, 'timestamp': '2025-10-01 04:12:34.973523', 'step': 1194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:35.004690', 'step': 1194, 'epoch': 2} {'type': 'loss', 'content': 0.05108269676566124, 'timestamp': '2025-10-01 04:12:35.006447', 'step': 1195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:35.036451', 'step': 1195, 'epoch': 2} {'type': 'loss', 'content': 0.014600159600377083, 'timestamp': '2025-10-01 04:12:35.060051', 'step': 1196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:35.090421', 'step': 1196, 'epoch': 2} {'type': 'loss', 'content': 0.05920051410794258, 'timestamp': '2025-10-01 04:12:35.092657', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:35.819119', 'step': 1197, 'epoch': 2} {'type': 'pplx', 'content': 69101049.67867415, 'timestamp': '2025-10-01 04:12:35.821244', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:35.849494', 'step': 1197, 'epoch': 2} {'type': 'loss', 'content': 0.0065787904895842075, 'timestamp': '2025-10-01 04:12:35.851545', 'step': 1198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:35.881617', 'step': 1198, 'epoch': 2} {'type': 'loss', 'content': 0.011520309373736382, 'timestamp': '2025-10-01 04:12:35.883572', 'step': 1199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:35.915336', 'step': 1199, 'epoch': 2} {'type': 'loss', 'content': 0.023187484592199326, 'timestamp': '2025-10-01 04:12:35.939632', 'step': 1200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:35.970263', 'step': 1200, 'epoch': 2} {'type': 'loss', 'content': 0.052248697727918625, 'timestamp': '2025-10-01 04:12:35.972760', 'step': 1201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.004487', 'step': 1201, 'epoch': 2} {'type': 'loss', 'content': 0.00456772418692708, 'timestamp': '2025-10-01 04:12:36.006449', 'step': 1202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.036235', 'step': 1202, 'epoch': 2} {'type': 'loss', 'content': 0.0023930370807647705, 'timestamp': '2025-10-01 04:12:36.038260', 'step': 1203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:36.068377', 'step': 1203, 'epoch': 2} {'type': 'loss', 'content': 0.004792022053152323, 'timestamp': '2025-10-01 04:12:36.092087', 'step': 1204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:36.122527', 'step': 1204, 'epoch': 2} {'type': 'loss', 'content': 0.03757337108254433, 'timestamp': '2025-10-01 04:12:36.124523', 'step': 1205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.154170', 'step': 1205, 'epoch': 2} {'type': 'loss', 'content': 0.03808292746543884, 'timestamp': '2025-10-01 04:12:36.156368', 'step': 1206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.186193', 'step': 1206, 'epoch': 2} {'type': 'loss', 'content': 0.0030230432748794556, 'timestamp': '2025-10-01 04:12:36.188496', 'step': 1207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.219100', 'step': 1207, 'epoch': 2} {'type': 'loss', 'content': 0.015412569046020508, 'timestamp': '2025-10-01 04:12:36.242563', 'step': 1208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:36.272963', 'step': 1208, 'epoch': 2} {'type': 'loss', 'content': 0.014492099173367023, 'timestamp': '2025-10-01 04:12:36.275009', 'step': 1209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.305306', 'step': 1209, 'epoch': 2} {'type': 'loss', 'content': 0.018064534291625023, 'timestamp': '2025-10-01 04:12:36.307387', 'step': 1210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.337538', 'step': 1210, 'epoch': 2} {'type': 'loss', 'content': 0.03734589368104935, 'timestamp': '2025-10-01 04:12:36.339730', 'step': 1211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.369990', 'step': 1211, 'epoch': 2} {'type': 'loss', 'content': 0.015398058108985424, 'timestamp': '2025-10-01 04:12:36.393763', 'step': 1212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:36.424249', 'step': 1212, 'epoch': 2} {'type': 'loss', 'content': 0.008857528679072857, 'timestamp': '2025-10-01 04:12:36.426096', 'step': 1213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.457153', 'step': 1213, 'epoch': 2} {'type': 'loss', 'content': 0.012673860415816307, 'timestamp': '2025-10-01 04:12:36.459215', 'step': 1214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.489247', 'step': 1214, 'epoch': 2} {'type': 'loss', 'content': 0.01826346106827259, 'timestamp': '2025-10-01 04:12:36.491135', 'step': 1215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:36.521267', 'step': 1215, 'epoch': 2} {'type': 'loss', 'content': 0.022028954699635506, 'timestamp': '2025-10-01 04:12:36.545367', 'step': 1216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:36.575524', 'step': 1216, 'epoch': 2} {'type': 'loss', 'content': 0.013720040209591389, 'timestamp': '2025-10-01 04:12:36.577401', 'step': 1217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.607402', 'step': 1217, 'epoch': 2} {'type': 'loss', 'content': 0.00724714994430542, 'timestamp': '2025-10-01 04:12:36.609833', 'step': 1218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:36.639914', 'step': 1218, 'epoch': 2} {'type': 'loss', 'content': 0.008106841705739498, 'timestamp': '2025-10-01 04:12:36.642402', 'step': 1219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.672793', 'step': 1219, 'epoch': 2} {'type': 'loss', 'content': 0.009874098002910614, 'timestamp': '2025-10-01 04:12:36.696779', 'step': 1220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.728949', 'step': 1220, 'epoch': 2} {'type': 'loss', 'content': 0.01156134158372879, 'timestamp': '2025-10-01 04:12:36.731050', 'step': 1221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.762049', 'step': 1221, 'epoch': 2} {'type': 'loss', 'content': 0.007826543413102627, 'timestamp': '2025-10-01 04:12:36.764101', 'step': 1222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.793637', 'step': 1222, 'epoch': 2} {'type': 'loss', 'content': 0.008319816552102566, 'timestamp': '2025-10-01 04:12:36.795713', 'step': 1223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.825563', 'step': 1223, 'epoch': 2} {'type': 'loss', 'content': 0.010037191212177277, 'timestamp': '2025-10-01 04:12:36.849058', 'step': 1224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.879230', 'step': 1224, 'epoch': 2} {'type': 'loss', 'content': 0.012897380627691746, 'timestamp': '2025-10-01 04:12:36.881410', 'step': 1225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.911686', 'step': 1225, 'epoch': 2} {'type': 'loss', 'content': 0.004419870208948851, 'timestamp': '2025-10-01 04:12:36.913889', 'step': 1226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.944695', 'step': 1226, 'epoch': 2} {'type': 'loss', 'content': 0.011351182125508785, 'timestamp': '2025-10-01 04:12:36.946817', 'step': 1227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:36.976795', 'step': 1227, 'epoch': 2} {'type': 'loss', 'content': 0.016698114573955536, 'timestamp': '2025-10-01 04:12:37.000503', 'step': 1228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.030489', 'step': 1228, 'epoch': 2} {'type': 'loss', 'content': 0.005861933343112469, 'timestamp': '2025-10-01 04:12:37.032403', 'step': 1229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.062255', 'step': 1229, 'epoch': 2} {'type': 'loss', 'content': 0.029591960832476616, 'timestamp': '2025-10-01 04:12:37.064641', 'step': 1230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.095276', 'step': 1230, 'epoch': 2} {'type': 'loss', 'content': 0.008629663847386837, 'timestamp': '2025-10-01 04:12:37.097268', 'step': 1231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.127494', 'step': 1231, 'epoch': 2} {'type': 'loss', 'content': 0.021872971206903458, 'timestamp': '2025-10-01 04:12:37.151206', 'step': 1232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.183106', 'step': 1232, 'epoch': 2} {'type': 'loss', 'content': 0.0034721808042377234, 'timestamp': '2025-10-01 04:12:37.185328', 'step': 1233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.215008', 'step': 1233, 'epoch': 2} {'type': 'loss', 'content': 0.00947582721710205, 'timestamp': '2025-10-01 04:12:37.216971', 'step': 1234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:37.246465', 'step': 1234, 'epoch': 2} {'type': 'loss', 'content': 0.01878984272480011, 'timestamp': '2025-10-01 04:12:37.248612', 'step': 1235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.278641', 'step': 1235, 'epoch': 2} {'type': 'loss', 'content': 0.011062067933380604, 'timestamp': '2025-10-01 04:12:37.302135', 'step': 1236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:37.332156', 'step': 1236, 'epoch': 2} {'type': 'loss', 'content': 0.01161190029233694, 'timestamp': '2025-10-01 04:12:37.334066', 'step': 1237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:37.363959', 'step': 1237, 'epoch': 2} {'type': 'loss', 'content': 0.01545110996812582, 'timestamp': '2025-10-01 04:12:37.366521', 'step': 1238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.396744', 'step': 1238, 'epoch': 2} {'type': 'loss', 'content': 0.02992149256169796, 'timestamp': '2025-10-01 04:12:37.399040', 'step': 1239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.428977', 'step': 1239, 'epoch': 2} {'type': 'loss', 'content': 0.01887401007115841, 'timestamp': '2025-10-01 04:12:37.452764', 'step': 1240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:37.483043', 'step': 1240, 'epoch': 2} {'type': 'loss', 'content': 0.007618209812790155, 'timestamp': '2025-10-01 04:12:37.485226', 'step': 1241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:37.515270', 'step': 1241, 'epoch': 2} {'type': 'loss', 'content': 0.017871305346488953, 'timestamp': '2025-10-01 04:12:37.517902', 'step': 1242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.547853', 'step': 1242, 'epoch': 2} {'type': 'loss', 'content': 0.02309580147266388, 'timestamp': '2025-10-01 04:12:37.550234', 'step': 1243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.580443', 'step': 1243, 'epoch': 2} {'type': 'loss', 'content': 0.023178985342383385, 'timestamp': '2025-10-01 04:12:37.604271', 'step': 1244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:37.635392', 'step': 1244, 'epoch': 2} {'type': 'loss', 'content': 0.028510218486189842, 'timestamp': '2025-10-01 04:12:37.637384', 'step': 1245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:37.667795', 'step': 1245, 'epoch': 2} {'type': 'loss', 'content': 0.02059764787554741, 'timestamp': '2025-10-01 04:12:37.670323', 'step': 1246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.700398', 'step': 1246, 'epoch': 2} {'type': 'loss', 'content': 0.02575409784913063, 'timestamp': '2025-10-01 04:12:37.702534', 'step': 1247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:37.732556', 'step': 1247, 'epoch': 2} {'type': 'loss', 'content': 0.01112529169768095, 'timestamp': '2025-10-01 04:12:37.756165', 'step': 1248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.786576', 'step': 1248, 'epoch': 2} {'type': 'loss', 'content': 0.026837946847081184, 'timestamp': '2025-10-01 04:12:37.788497', 'step': 1249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:37.818068', 'step': 1249, 'epoch': 2} {'type': 'loss', 'content': 0.030917344614863396, 'timestamp': '2025-10-01 04:12:37.820132', 'step': 1250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.851712', 'step': 1250, 'epoch': 2} {'type': 'loss', 'content': 0.022184815257787704, 'timestamp': '2025-10-01 04:12:37.853716', 'step': 1251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:37.883225', 'step': 1251, 'epoch': 2} {'type': 'loss', 'content': 0.009707905352115631, 'timestamp': '2025-10-01 04:12:37.907652', 'step': 1252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:37.937552', 'step': 1252, 'epoch': 2} {'type': 'loss', 'content': 0.003488952526822686, 'timestamp': '2025-10-01 04:12:37.939557', 'step': 1253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:37.969523', 'step': 1253, 'epoch': 2} {'type': 'loss', 'content': 0.00924562569707632, 'timestamp': '2025-10-01 04:12:37.971995', 'step': 1254, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:38.718639', 'step': 1254, 'epoch': 2} {'type': 'pplx', 'content': 72587363.3396665, 'timestamp': '2025-10-01 04:12:38.722363', 'step': 1254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:38.755399', 'step': 1254, 'epoch': 2} {'type': 'loss', 'content': 0.007001742720603943, 'timestamp': '2025-10-01 04:12:38.759029', 'step': 1255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:38.793427', 'step': 1255, 'epoch': 2} {'type': 'loss', 'content': 0.027426881715655327, 'timestamp': '2025-10-01 04:12:38.822925', 'step': 1256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:38.856954', 'step': 1256, 'epoch': 2} {'type': 'loss', 'content': 0.01879812777042389, 'timestamp': '2025-10-01 04:12:38.860658', 'step': 1257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:38.894720', 'step': 1257, 'epoch': 2} {'type': 'loss', 'content': 0.01835942268371582, 'timestamp': '2025-10-01 04:12:38.900475', 'step': 1258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:38.936837', 'step': 1258, 'epoch': 2} {'type': 'loss', 'content': 0.009284531697630882, 'timestamp': '2025-10-01 04:12:38.940589', 'step': 1259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:38.974956', 'step': 1259, 'epoch': 2} {'type': 'loss', 'content': 0.01425645500421524, 'timestamp': '2025-10-01 04:12:38.998488', 'step': 1260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.034927', 'step': 1260, 'epoch': 2} {'type': 'loss', 'content': 0.050299111753702164, 'timestamp': '2025-10-01 04:12:39.039921', 'step': 1261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.103074', 'step': 1261, 'epoch': 2} {'type': 'loss', 'content': 0.01674676313996315, 'timestamp': '2025-10-01 04:12:39.118546', 'step': 1262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:39.196189', 'step': 1262, 'epoch': 2} {'type': 'loss', 'content': 0.02486700937151909, 'timestamp': '2025-10-01 04:12:39.199387', 'step': 1263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:39.271126', 'step': 1263, 'epoch': 2} {'type': 'loss', 'content': 0.015348898246884346, 'timestamp': '2025-10-01 04:12:39.309601', 'step': 1264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.382649', 'step': 1264, 'epoch': 2} {'type': 'loss', 'content': 0.01046433113515377, 'timestamp': '2025-10-01 04:12:39.397938', 'step': 1265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:39.474775', 'step': 1265, 'epoch': 2} {'type': 'loss', 'content': 0.013404331170022488, 'timestamp': '2025-10-01 04:12:39.487640', 'step': 1266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.559680', 'step': 1266, 'epoch': 2} {'type': 'loss', 'content': 0.01794937252998352, 'timestamp': '2025-10-01 04:12:39.565149', 'step': 1267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.604968', 'step': 1267, 'epoch': 2} {'type': 'loss', 'content': 0.027332397177815437, 'timestamp': '2025-10-01 04:12:39.642227', 'step': 1268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.713955', 'step': 1268, 'epoch': 2} {'type': 'loss', 'content': 0.016648003831505775, 'timestamp': '2025-10-01 04:12:39.727933', 'step': 1269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.801066', 'step': 1269, 'epoch': 2} {'type': 'loss', 'content': 0.02556251361966133, 'timestamp': '2025-10-01 04:12:39.818648', 'step': 1270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:39.862577', 'step': 1270, 'epoch': 2} {'type': 'loss', 'content': 0.007685820106416941, 'timestamp': '2025-10-01 04:12:39.873162', 'step': 1271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:39.937443', 'step': 1271, 'epoch': 2} {'type': 'loss', 'content': 0.021718529984354973, 'timestamp': '2025-10-01 04:12:39.967126', 'step': 1272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.026398', 'step': 1272, 'epoch': 2} {'type': 'loss', 'content': 0.008440138772130013, 'timestamp': '2025-10-01 04:12:40.030765', 'step': 1273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:40.079568', 'step': 1273, 'epoch': 2} {'type': 'loss', 'content': 0.022746693342924118, 'timestamp': '2025-10-01 04:12:40.086934', 'step': 1274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.142492', 'step': 1274, 'epoch': 2} {'type': 'loss', 'content': 0.007633621338754892, 'timestamp': '2025-10-01 04:12:40.153098', 'step': 1275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.208552', 'step': 1275, 'epoch': 2} {'type': 'loss', 'content': 0.005806318949908018, 'timestamp': '2025-10-01 04:12:40.233761', 'step': 1276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:40.274585', 'step': 1276, 'epoch': 2} {'type': 'loss', 'content': 0.016788044944405556, 'timestamp': '2025-10-01 04:12:40.277998', 'step': 1277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:40.334141', 'step': 1277, 'epoch': 2} {'type': 'loss', 'content': 0.01413695514202118, 'timestamp': '2025-10-01 04:12:40.345076', 'step': 1278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.399249', 'step': 1278, 'epoch': 2} {'type': 'loss', 'content': 0.03610283136367798, 'timestamp': '2025-10-01 04:12:40.404084', 'step': 1279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:40.444491', 'step': 1279, 'epoch': 2} {'type': 'loss', 'content': 0.01976608671247959, 'timestamp': '2025-10-01 04:12:40.477011', 'step': 1280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.532785', 'step': 1280, 'epoch': 2} {'type': 'loss', 'content': 0.010782641358673573, 'timestamp': '2025-10-01 04:12:40.543694', 'step': 1281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.599570', 'step': 1281, 'epoch': 2} {'type': 'loss', 'content': 0.01856873743236065, 'timestamp': '2025-10-01 04:12:40.609880', 'step': 1282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:40.663642', 'step': 1282, 'epoch': 2} {'type': 'loss', 'content': 0.008483139798045158, 'timestamp': '2025-10-01 04:12:40.673668', 'step': 1283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:40.737905', 'step': 1283, 'epoch': 2} {'type': 'loss', 'content': 0.007417923770844936, 'timestamp': '2025-10-01 04:12:40.769858', 'step': 1284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.812176', 'step': 1284, 'epoch': 2} {'type': 'loss', 'content': 0.030951378867030144, 'timestamp': '2025-10-01 04:12:40.816743', 'step': 1285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:40.873339', 'step': 1285, 'epoch': 2} {'type': 'loss', 'content': 0.006046353839337826, 'timestamp': '2025-10-01 04:12:40.881636', 'step': 1286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:40.931805', 'step': 1286, 'epoch': 2} {'type': 'loss', 'content': 0.01013564970344305, 'timestamp': '2025-10-01 04:12:40.942893', 'step': 1287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.000376', 'step': 1287, 'epoch': 2} {'type': 'loss', 'content': 0.005417356733232737, 'timestamp': '2025-10-01 04:12:41.032435', 'step': 1288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.086631', 'step': 1288, 'epoch': 2} {'type': 'loss', 'content': 0.03147173300385475, 'timestamp': '2025-10-01 04:12:41.088946', 'step': 1289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:41.133055', 'step': 1289, 'epoch': 2} {'type': 'loss', 'content': 0.012722591869533062, 'timestamp': '2025-10-01 04:12:41.142496', 'step': 1290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.196519', 'step': 1290, 'epoch': 2} {'type': 'loss', 'content': 0.018516169860959053, 'timestamp': '2025-10-01 04:12:41.207964', 'step': 1291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.261168', 'step': 1291, 'epoch': 2} {'type': 'loss', 'content': 0.015589392744004726, 'timestamp': '2025-10-01 04:12:41.293501', 'step': 1292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.342087', 'step': 1292, 'epoch': 2} {'type': 'loss', 'content': 0.004773109219968319, 'timestamp': '2025-10-01 04:12:41.345519', 'step': 1293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.397055', 'step': 1293, 'epoch': 2} {'type': 'loss', 'content': 0.006328233052045107, 'timestamp': '2025-10-01 04:12:41.408769', 'step': 1294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.464362', 'step': 1294, 'epoch': 2} {'type': 'loss', 'content': 0.00493656238541007, 'timestamp': '2025-10-01 04:12:41.476386', 'step': 1295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.534489', 'step': 1295, 'epoch': 2} {'type': 'loss', 'content': 0.027140628546476364, 'timestamp': '2025-10-01 04:12:41.564498', 'step': 1296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:41.607600', 'step': 1296, 'epoch': 2} {'type': 'loss', 'content': 0.01725948601961136, 'timestamp': '2025-10-01 04:12:41.616107', 'step': 1297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.665739', 'step': 1297, 'epoch': 2} {'type': 'loss', 'content': 0.021593991667032242, 'timestamp': '2025-10-01 04:12:41.675446', 'step': 1298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:41.733772', 'step': 1298, 'epoch': 2} {'type': 'loss', 'content': 0.023603880777955055, 'timestamp': '2025-10-01 04:12:41.738912', 'step': 1299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.797366', 'step': 1299, 'epoch': 2} {'type': 'loss', 'content': 0.013666218146681786, 'timestamp': '2025-10-01 04:12:41.826305', 'step': 1300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:41.867111', 'step': 1300, 'epoch': 2} {'type': 'loss', 'content': 0.02010071836411953, 'timestamp': '2025-10-01 04:12:41.876920', 'step': 1301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:41.929900', 'step': 1301, 'epoch': 2} {'type': 'loss', 'content': 0.011804967187345028, 'timestamp': '2025-10-01 04:12:41.938433', 'step': 1302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:41.995283', 'step': 1302, 'epoch': 2} {'type': 'loss', 'content': 0.014817488379776478, 'timestamp': '2025-10-01 04:12:42.005369', 'step': 1303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:42.063041', 'step': 1303, 'epoch': 2} {'type': 'loss', 'content': 0.006263951305299997, 'timestamp': '2025-10-01 04:12:42.097569', 'step': 1304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:42.155080', 'step': 1304, 'epoch': 2} {'type': 'loss', 'content': 0.013046405278146267, 'timestamp': '2025-10-01 04:12:42.159911', 'step': 1305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:42.233711', 'step': 1305, 'epoch': 2} {'type': 'loss', 'content': 0.00669970503076911, 'timestamp': '2025-10-01 04:12:42.245833', 'step': 1306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:42.303415', 'step': 1306, 'epoch': 2} {'type': 'loss', 'content': 0.015233458951115608, 'timestamp': '2025-10-01 04:12:42.309314', 'step': 1307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:42.362067', 'step': 1307, 'epoch': 2} {'type': 'loss', 'content': 0.01641346700489521, 'timestamp': '2025-10-01 04:12:42.388069', 'step': 1308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:42.443349', 'step': 1308, 'epoch': 2} {'type': 'loss', 'content': 0.007500012405216694, 'timestamp': '2025-10-01 04:12:42.450010', 'step': 1309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:42.511000', 'step': 1309, 'epoch': 2} {'type': 'loss', 'content': 0.015264661982655525, 'timestamp': '2025-10-01 04:12:42.520989', 'step': 1310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:42.578916', 'step': 1310, 'epoch': 2} {'type': 'loss', 'content': 0.018528863787651062, 'timestamp': '2025-10-01 04:12:42.588548', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:44.227776', 'step': 1311, 'epoch': 2} {'type': 'pplx', 'content': 78869573.43853244, 'timestamp': '2025-10-01 04:12:44.245101', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:44.299810', 'step': 1311, 'epoch': 2} {'type': 'loss', 'content': 0.0038048250135034323, 'timestamp': '2025-10-01 04:12:44.366906', 'step': 1312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:44.424004', 'step': 1312, 'epoch': 2} {'type': 'loss', 'content': 0.006456837523728609, 'timestamp': '2025-10-01 04:12:44.432596', 'step': 1313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:44.476682', 'step': 1313, 'epoch': 2} {'type': 'loss', 'content': 0.0011937960516661406, 'timestamp': '2025-10-01 04:12:44.486853', 'step': 1314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:44.561622', 'step': 1314, 'epoch': 2} {'type': 'loss', 'content': 0.015916863456368446, 'timestamp': '2025-10-01 04:12:44.577906', 'step': 1315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:44.633700', 'step': 1315, 'epoch': 2} {'type': 'loss', 'content': 0.00806934293359518, 'timestamp': '2025-10-01 04:12:44.667446', 'step': 1316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:44.718982', 'step': 1316, 'epoch': 2} {'type': 'loss', 'content': 0.01778421550989151, 'timestamp': '2025-10-01 04:12:44.730100', 'step': 1317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:44.793413', 'step': 1317, 'epoch': 2} {'type': 'loss', 'content': 0.0024733245372772217, 'timestamp': '2025-10-01 04:12:44.807968', 'step': 1318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:44.867276', 'step': 1318, 'epoch': 2} {'type': 'loss', 'content': 0.02881716750562191, 'timestamp': '2025-10-01 04:12:44.879554', 'step': 1319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:44.951970', 'step': 1319, 'epoch': 2} {'type': 'loss', 'content': 0.007030692417174578, 'timestamp': '2025-10-01 04:12:44.983607', 'step': 1320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:45.051807', 'step': 1320, 'epoch': 2} {'type': 'loss', 'content': 0.04227086529135704, 'timestamp': '2025-10-01 04:12:45.074219', 'step': 1321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:45.171443', 'step': 1321, 'epoch': 2} {'type': 'loss', 'content': 0.029042348265647888, 'timestamp': '2025-10-01 04:12:45.212720', 'step': 1322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:45.281702', 'step': 1322, 'epoch': 2} {'type': 'loss', 'content': 0.01102940458804369, 'timestamp': '2025-10-01 04:12:45.295208', 'step': 1323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:45.386960', 'step': 1323, 'epoch': 2} {'type': 'loss', 'content': 0.006163137499243021, 'timestamp': '2025-10-01 04:12:45.426504', 'step': 1324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:45.482658', 'step': 1324, 'epoch': 2} {'type': 'loss', 'content': 0.0057218037545681, 'timestamp': '2025-10-01 04:12:45.509154', 'step': 1325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:45.571372', 'step': 1325, 'epoch': 2} {'type': 'loss', 'content': 0.00786365382373333, 'timestamp': '2025-10-01 04:12:45.579516', 'step': 1326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:45.645173', 'step': 1326, 'epoch': 2} {'type': 'loss', 'content': 0.007059741299599409, 'timestamp': '2025-10-01 04:12:45.648177', 'step': 1327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:45.702150', 'step': 1327, 'epoch': 2} {'type': 'loss', 'content': 0.008876658976078033, 'timestamp': '2025-10-01 04:12:45.733461', 'step': 1328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:45.787199', 'step': 1328, 'epoch': 2} {'type': 'loss', 'content': 0.01941383071243763, 'timestamp': '2025-10-01 04:12:45.797192', 'step': 1329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:45.848611', 'step': 1329, 'epoch': 2} {'type': 'loss', 'content': 0.005369930062443018, 'timestamp': '2025-10-01 04:12:45.862259', 'step': 1330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:45.920605', 'step': 1330, 'epoch': 2} {'type': 'loss', 'content': 0.04618501663208008, 'timestamp': '2025-10-01 04:12:45.924922', 'step': 1331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:45.981540', 'step': 1331, 'epoch': 2} {'type': 'loss', 'content': 0.014543636702001095, 'timestamp': '2025-10-01 04:12:46.014255', 'step': 1332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.059662', 'step': 1332, 'epoch': 2} {'type': 'loss', 'content': 0.01306221354752779, 'timestamp': '2025-10-01 04:12:46.070833', 'step': 1333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.132546', 'step': 1333, 'epoch': 2} {'type': 'loss', 'content': 0.016116904094815254, 'timestamp': '2025-10-01 04:12:46.142433', 'step': 1334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.200866', 'step': 1334, 'epoch': 2} {'type': 'loss', 'content': 0.025694649666547775, 'timestamp': '2025-10-01 04:12:46.210264', 'step': 1335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.267224', 'step': 1335, 'epoch': 2} {'type': 'loss', 'content': 0.006386879365891218, 'timestamp': '2025-10-01 04:12:46.296373', 'step': 1336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:46.346554', 'step': 1336, 'epoch': 2} {'type': 'loss', 'content': 0.011319021694362164, 'timestamp': '2025-10-01 04:12:46.355555', 'step': 1337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.411148', 'step': 1337, 'epoch': 2} {'type': 'loss', 'content': 0.016311505809426308, 'timestamp': '2025-10-01 04:12:46.419762', 'step': 1338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:46.478960', 'step': 1338, 'epoch': 2} {'type': 'loss', 'content': 0.003795040538534522, 'timestamp': '2025-10-01 04:12:46.486964', 'step': 1339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:46.542375', 'step': 1339, 'epoch': 2} {'type': 'loss', 'content': 0.014971991069614887, 'timestamp': '2025-10-01 04:12:46.572419', 'step': 1340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:46.622727', 'step': 1340, 'epoch': 2} {'type': 'loss', 'content': 0.02324819751083851, 'timestamp': '2025-10-01 04:12:46.632490', 'step': 1341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:46.674567', 'step': 1341, 'epoch': 2} {'type': 'loss', 'content': 0.009367035701870918, 'timestamp': '2025-10-01 04:12:46.685230', 'step': 1342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:46.746431', 'step': 1342, 'epoch': 2} {'type': 'loss', 'content': 0.008872377686202526, 'timestamp': '2025-10-01 04:12:46.756063', 'step': 1343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.812618', 'step': 1343, 'epoch': 2} {'type': 'loss', 'content': 0.015090836212038994, 'timestamp': '2025-10-01 04:12:46.842621', 'step': 1344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.885791', 'step': 1344, 'epoch': 2} {'type': 'loss', 'content': 0.0024052553344517946, 'timestamp': '2025-10-01 04:12:46.888946', 'step': 1345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:46.942596', 'step': 1345, 'epoch': 2} {'type': 'loss', 'content': 0.015845181420445442, 'timestamp': '2025-10-01 04:12:46.952123', 'step': 1346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:46.998233', 'step': 1346, 'epoch': 2} {'type': 'loss', 'content': 0.015794331207871437, 'timestamp': '2025-10-01 04:12:47.001259', 'step': 1347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.049684', 'step': 1347, 'epoch': 2} {'type': 'loss', 'content': 0.0033900614362210035, 'timestamp': '2025-10-01 04:12:47.074828', 'step': 1348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:47.139834', 'step': 1348, 'epoch': 2} {'type': 'loss', 'content': 0.0025195071939378977, 'timestamp': '2025-10-01 04:12:47.144106', 'step': 1349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.198101', 'step': 1349, 'epoch': 2} {'type': 'loss', 'content': 0.007320540491491556, 'timestamp': '2025-10-01 04:12:47.200675', 'step': 1350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:47.268689', 'step': 1350, 'epoch': 2} {'type': 'loss', 'content': 0.006169808562844992, 'timestamp': '2025-10-01 04:12:47.284157', 'step': 1351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.344166', 'step': 1351, 'epoch': 2} {'type': 'loss', 'content': 0.0029406158719211817, 'timestamp': '2025-10-01 04:12:47.375236', 'step': 1352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.443664', 'step': 1352, 'epoch': 2} {'type': 'loss', 'content': 0.004101304803043604, 'timestamp': '2025-10-01 04:12:47.456094', 'step': 1353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.513465', 'step': 1353, 'epoch': 2} {'type': 'loss', 'content': 0.019129853695631027, 'timestamp': '2025-10-01 04:12:47.526344', 'step': 1354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.585727', 'step': 1354, 'epoch': 2} {'type': 'loss', 'content': 0.022670626640319824, 'timestamp': '2025-10-01 04:12:47.595314', 'step': 1355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.649494', 'step': 1355, 'epoch': 2} {'type': 'loss', 'content': 0.03189031034708023, 'timestamp': '2025-10-01 04:12:47.674507', 'step': 1356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.719419', 'step': 1356, 'epoch': 2} {'type': 'loss', 'content': 0.009254112839698792, 'timestamp': '2025-10-01 04:12:47.732305', 'step': 1357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.788157', 'step': 1357, 'epoch': 2} {'type': 'loss', 'content': 0.008511737920343876, 'timestamp': '2025-10-01 04:12:47.799441', 'step': 1358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:47.859103', 'step': 1358, 'epoch': 2} {'type': 'loss', 'content': 0.02656170353293419, 'timestamp': '2025-10-01 04:12:47.873133', 'step': 1359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:47.914929', 'step': 1359, 'epoch': 2} {'type': 'loss', 'content': 0.003011648776009679, 'timestamp': '2025-10-01 04:12:47.950952', 'step': 1360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:47.992653', 'step': 1360, 'epoch': 2} {'type': 'loss', 'content': 0.03632878139615059, 'timestamp': '2025-10-01 04:12:47.999005', 'step': 1361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:48.064544', 'step': 1361, 'epoch': 2} {'type': 'loss', 'content': 0.01709390990436077, 'timestamp': '2025-10-01 04:12:48.076366', 'step': 1362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:48.130187', 'step': 1362, 'epoch': 2} {'type': 'loss', 'content': 0.00863829255104065, 'timestamp': '2025-10-01 04:12:48.133265', 'step': 1363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:48.184623', 'step': 1363, 'epoch': 2} {'type': 'loss', 'content': 0.003445104230195284, 'timestamp': '2025-10-01 04:12:48.211330', 'step': 1364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:48.267189', 'step': 1364, 'epoch': 2} {'type': 'loss', 'content': 0.0165807344019413, 'timestamp': '2025-10-01 04:12:48.278535', 'step': 1365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:48.343826', 'step': 1365, 'epoch': 2} {'type': 'loss', 'content': 0.006906905211508274, 'timestamp': '2025-10-01 04:12:48.352834', 'step': 1366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:48.397252', 'step': 1366, 'epoch': 2} {'type': 'loss', 'content': 0.01691582053899765, 'timestamp': '2025-10-01 04:12:48.400080', 'step': 1367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:48.451326', 'step': 1367, 'epoch': 2} {'type': 'loss', 'content': 0.011257420293986797, 'timestamp': '2025-10-01 04:12:48.480696', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:49.949607', 'step': 1368, 'epoch': 2} {'type': 'pplx', 'content': 85059319.48994762, 'timestamp': '2025-10-01 04:12:49.959371', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.006244', 'step': 1368, 'epoch': 2} {'type': 'loss', 'content': 0.0026699949521571398, 'timestamp': '2025-10-01 04:12:50.016682', 'step': 1369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.075039', 'step': 1369, 'epoch': 2} {'type': 'loss', 'content': 0.0056277550756931305, 'timestamp': '2025-10-01 04:12:50.087037', 'step': 1370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:12:50.144334', 'step': 1370, 'epoch': 2} {'type': 'loss', 'content': 0.008586110547184944, 'timestamp': '2025-10-01 04:12:50.154356', 'step': 1371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:50.211740', 'step': 1371, 'epoch': 2} {'type': 'loss', 'content': 0.0090508246794343, 'timestamp': '2025-10-01 04:12:50.243630', 'step': 1372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.297733', 'step': 1372, 'epoch': 2} {'type': 'loss', 'content': 0.013520614244043827, 'timestamp': '2025-10-01 04:12:50.309767', 'step': 1373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.364701', 'step': 1373, 'epoch': 2} {'type': 'loss', 'content': 0.023014064878225327, 'timestamp': '2025-10-01 04:12:50.374906', 'step': 1374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:50.430937', 'step': 1374, 'epoch': 2} {'type': 'loss', 'content': 0.025585060939192772, 'timestamp': '2025-10-01 04:12:50.439301', 'step': 1375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.496777', 'step': 1375, 'epoch': 2} {'type': 'loss', 'content': 0.006834993604570627, 'timestamp': '2025-10-01 04:12:50.521045', 'step': 1376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:50.576978', 'step': 1376, 'epoch': 2} {'type': 'loss', 'content': 0.027560051530599594, 'timestamp': '2025-10-01 04:12:50.587493', 'step': 1377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.648749', 'step': 1377, 'epoch': 2} {'type': 'loss', 'content': 0.01523479912430048, 'timestamp': '2025-10-01 04:12:50.656771', 'step': 1378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.711529', 'step': 1378, 'epoch': 2} {'type': 'loss', 'content': 0.029804764315485954, 'timestamp': '2025-10-01 04:12:50.720572', 'step': 1379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.779679', 'step': 1379, 'epoch': 2} {'type': 'loss', 'content': 0.0074521261267364025, 'timestamp': '2025-10-01 04:12:50.811163', 'step': 1380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.862810', 'step': 1380, 'epoch': 2} {'type': 'loss', 'content': 0.025462573394179344, 'timestamp': '2025-10-01 04:12:50.873074', 'step': 1381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.920966', 'step': 1381, 'epoch': 2} {'type': 'loss', 'content': 0.019137471914291382, 'timestamp': '2025-10-01 04:12:50.932078', 'step': 1382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:50.988472', 'step': 1382, 'epoch': 2} {'type': 'loss', 'content': 0.022238006815314293, 'timestamp': '2025-10-01 04:12:50.998805', 'step': 1383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.057338', 'step': 1383, 'epoch': 2} {'type': 'loss', 'content': 0.003020750591531396, 'timestamp': '2025-10-01 04:12:51.089118', 'step': 1384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.163894', 'step': 1384, 'epoch': 2} {'type': 'loss', 'content': 0.02706991508603096, 'timestamp': '2025-10-01 04:12:51.176846', 'step': 1385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.259214', 'step': 1385, 'epoch': 2} {'type': 'loss', 'content': 0.028605103492736816, 'timestamp': '2025-10-01 04:12:51.262810', 'step': 1386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.315743', 'step': 1386, 'epoch': 2} {'type': 'loss', 'content': 0.025180544704198837, 'timestamp': '2025-10-01 04:12:51.326277', 'step': 1387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.383324', 'step': 1387, 'epoch': 2} {'type': 'loss', 'content': 0.0067850155755877495, 'timestamp': '2025-10-01 04:12:51.414825', 'step': 1388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.471586', 'step': 1388, 'epoch': 2} {'type': 'loss', 'content': 0.01375055406242609, 'timestamp': '2025-10-01 04:12:51.481756', 'step': 1389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:51.548022', 'step': 1389, 'epoch': 2} {'type': 'loss', 'content': 0.014415652491152287, 'timestamp': '2025-10-01 04:12:51.554362', 'step': 1390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:51.605115', 'step': 1390, 'epoch': 2} {'type': 'loss', 'content': 0.007323476020246744, 'timestamp': '2025-10-01 04:12:51.608176', 'step': 1391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:51.663284', 'step': 1391, 'epoch': 2} {'type': 'loss', 'content': 0.010478307493031025, 'timestamp': '2025-10-01 04:12:51.696534', 'step': 1392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.752897', 'step': 1392, 'epoch': 2} {'type': 'loss', 'content': 0.02164110541343689, 'timestamp': '2025-10-01 04:12:51.766314', 'step': 1393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.822442', 'step': 1393, 'epoch': 2} {'type': 'loss', 'content': 0.023013019934296608, 'timestamp': '2025-10-01 04:12:51.837219', 'step': 1394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.910087', 'step': 1394, 'epoch': 2} {'type': 'loss', 'content': 0.009080707095563412, 'timestamp': '2025-10-01 04:12:51.914319', 'step': 1395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:51.980232', 'step': 1395, 'epoch': 2} {'type': 'loss', 'content': 0.01225399412214756, 'timestamp': '2025-10-01 04:12:52.020515', 'step': 1396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.095286', 'step': 1396, 'epoch': 2} {'type': 'loss', 'content': 0.020442839711904526, 'timestamp': '2025-10-01 04:12:52.113470', 'step': 1397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.186285', 'step': 1397, 'epoch': 2} {'type': 'loss', 'content': 0.047727856785058975, 'timestamp': '2025-10-01 04:12:52.201680', 'step': 1398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.240690', 'step': 1398, 'epoch': 2} {'type': 'loss', 'content': 0.0108518460765481, 'timestamp': '2025-10-01 04:12:52.254418', 'step': 1399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:52.308623', 'step': 1399, 'epoch': 2} {'type': 'loss', 'content': 0.011700905859470367, 'timestamp': '2025-10-01 04:12:52.333993', 'step': 1400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.389322', 'step': 1400, 'epoch': 2} {'type': 'loss', 'content': 0.010864133015275002, 'timestamp': '2025-10-01 04:12:52.393341', 'step': 1401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:52.457203', 'step': 1401, 'epoch': 2} {'type': 'loss', 'content': 0.01521242968738079, 'timestamp': '2025-10-01 04:12:52.467015', 'step': 1402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.522928', 'step': 1402, 'epoch': 2} {'type': 'loss', 'content': 0.009760470129549503, 'timestamp': '2025-10-01 04:12:52.534789', 'step': 1403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:52.591165', 'step': 1403, 'epoch': 2} {'type': 'loss', 'content': 0.0272379107773304, 'timestamp': '2025-10-01 04:12:52.624671', 'step': 1404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.684040', 'step': 1404, 'epoch': 2} {'type': 'loss', 'content': 0.01665526069700718, 'timestamp': '2025-10-01 04:12:52.694939', 'step': 1405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.748565', 'step': 1405, 'epoch': 2} {'type': 'loss', 'content': 0.02335176430642605, 'timestamp': '2025-10-01 04:12:52.756002', 'step': 1406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:52.809191', 'step': 1406, 'epoch': 2} {'type': 'loss', 'content': 0.004674916621297598, 'timestamp': '2025-10-01 04:12:52.813487', 'step': 1407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.851729', 'step': 1407, 'epoch': 2} {'type': 'loss', 'content': 0.013737602159380913, 'timestamp': '2025-10-01 04:12:52.877217', 'step': 1408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:52.917518', 'step': 1408, 'epoch': 2} {'type': 'loss', 'content': 0.018468055874109268, 'timestamp': '2025-10-01 04:12:52.924918', 'step': 1409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:52.972744', 'step': 1409, 'epoch': 2} {'type': 'loss', 'content': 0.004916821606457233, 'timestamp': '2025-10-01 04:12:52.980930', 'step': 1410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.019702', 'step': 1410, 'epoch': 2} {'type': 'loss', 'content': 0.025401007384061813, 'timestamp': '2025-10-01 04:12:53.024393', 'step': 1411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.059235', 'step': 1411, 'epoch': 2} {'type': 'loss', 'content': 0.00917412806302309, 'timestamp': '2025-10-01 04:12:53.084790', 'step': 1412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.122164', 'step': 1412, 'epoch': 2} {'type': 'loss', 'content': 0.009031496942043304, 'timestamp': '2025-10-01 04:12:53.131656', 'step': 1413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:53.193011', 'step': 1413, 'epoch': 2} {'type': 'loss', 'content': 0.002431566594168544, 'timestamp': '2025-10-01 04:12:53.202473', 'step': 1414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:53.252552', 'step': 1414, 'epoch': 2} {'type': 'loss', 'content': 0.02220940962433815, 'timestamp': '2025-10-01 04:12:53.261051', 'step': 1415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.311908', 'step': 1415, 'epoch': 2} {'type': 'loss', 'content': 0.008876879699528217, 'timestamp': '2025-10-01 04:12:53.340842', 'step': 1416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.395352', 'step': 1416, 'epoch': 2} {'type': 'loss', 'content': 0.0041095237247645855, 'timestamp': '2025-10-01 04:12:53.405356', 'step': 1417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.456345', 'step': 1417, 'epoch': 2} {'type': 'loss', 'content': 0.006583545822650194, 'timestamp': '2025-10-01 04:12:53.464714', 'step': 1418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:53.517470', 'step': 1418, 'epoch': 2} {'type': 'loss', 'content': 0.009007715620100498, 'timestamp': '2025-10-01 04:12:53.525617', 'step': 1419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.579787', 'step': 1419, 'epoch': 2} {'type': 'loss', 'content': 0.012834744527935982, 'timestamp': '2025-10-01 04:12:53.612024', 'step': 1420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.676965', 'step': 1420, 'epoch': 2} {'type': 'loss', 'content': 0.017873764038085938, 'timestamp': '2025-10-01 04:12:53.687305', 'step': 1421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.750360', 'step': 1421, 'epoch': 2} {'type': 'loss', 'content': 0.0027618408203125, 'timestamp': '2025-10-01 04:12:53.753766', 'step': 1422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:53.809743', 'step': 1422, 'epoch': 2} {'type': 'loss', 'content': 0.04755884408950806, 'timestamp': '2025-10-01 04:12:53.812911', 'step': 1423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:53.865498', 'step': 1423, 'epoch': 2} {'type': 'loss', 'content': 0.026457469910383224, 'timestamp': '2025-10-01 04:12:53.895052', 'step': 1424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:53.946979', 'step': 1424, 'epoch': 2} {'type': 'loss', 'content': 0.02928990125656128, 'timestamp': '2025-10-01 04:12:53.956292', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:55.274016', 'step': 1425, 'epoch': 2} {'type': 'pplx', 'content': 81024098.02905558, 'timestamp': '2025-10-01 04:12:55.283137', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.323662', 'step': 1425, 'epoch': 2} {'type': 'loss', 'content': 0.0035102497786283493, 'timestamp': '2025-10-01 04:12:55.331996', 'step': 1426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.383101', 'step': 1426, 'epoch': 2} {'type': 'loss', 'content': 0.007271726615726948, 'timestamp': '2025-10-01 04:12:55.390564', 'step': 1427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.428679', 'step': 1427, 'epoch': 2} {'type': 'loss', 'content': 0.02840333618223667, 'timestamp': '2025-10-01 04:12:55.459060', 'step': 1428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:55.503212', 'step': 1428, 'epoch': 2} {'type': 'loss', 'content': 0.004031289368867874, 'timestamp': '2025-10-01 04:12:55.506514', 'step': 1429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.545395', 'step': 1429, 'epoch': 2} {'type': 'loss', 'content': 0.01716061308979988, 'timestamp': '2025-10-01 04:12:55.553106', 'step': 1430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.602464', 'step': 1430, 'epoch': 2} {'type': 'loss', 'content': 0.008836604654788971, 'timestamp': '2025-10-01 04:12:55.606287', 'step': 1431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.653722', 'step': 1431, 'epoch': 2} {'type': 'loss', 'content': 0.012098804116249084, 'timestamp': '2025-10-01 04:12:55.683282', 'step': 1432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:55.732670', 'step': 1432, 'epoch': 2} {'type': 'loss', 'content': 0.024643605574965477, 'timestamp': '2025-10-01 04:12:55.736455', 'step': 1433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:55.781764', 'step': 1433, 'epoch': 2} {'type': 'loss', 'content': 0.01903327740728855, 'timestamp': '2025-10-01 04:12:55.790016', 'step': 1434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.826490', 'step': 1434, 'epoch': 2} {'type': 'loss', 'content': 0.016623888164758682, 'timestamp': '2025-10-01 04:12:55.835021', 'step': 1435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:55.888600', 'step': 1435, 'epoch': 2} {'type': 'loss', 'content': 0.009309085085988045, 'timestamp': '2025-10-01 04:12:55.919764', 'step': 1436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:55.978186', 'step': 1436, 'epoch': 2} {'type': 'loss', 'content': 0.01641160063445568, 'timestamp': '2025-10-01 04:12:55.985389', 'step': 1437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:56.033625', 'step': 1437, 'epoch': 2} {'type': 'loss', 'content': 0.013727948069572449, 'timestamp': '2025-10-01 04:12:56.042086', 'step': 1438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.092146', 'step': 1438, 'epoch': 2} {'type': 'loss', 'content': 0.01171187125146389, 'timestamp': '2025-10-01 04:12:56.099344', 'step': 1439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.149026', 'step': 1439, 'epoch': 2} {'type': 'loss', 'content': 0.005448337644338608, 'timestamp': '2025-10-01 04:12:56.177600', 'step': 1440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:56.226128', 'step': 1440, 'epoch': 2} {'type': 'loss', 'content': 0.005468662362545729, 'timestamp': '2025-10-01 04:12:56.235762', 'step': 1441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.280850', 'step': 1441, 'epoch': 2} {'type': 'loss', 'content': 0.006323696114122868, 'timestamp': '2025-10-01 04:12:56.289895', 'step': 1442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.360876', 'step': 1442, 'epoch': 2} {'type': 'loss', 'content': 0.011916747316718102, 'timestamp': '2025-10-01 04:12:56.368685', 'step': 1443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:56.422136', 'step': 1443, 'epoch': 2} {'type': 'loss', 'content': 0.0029194927774369717, 'timestamp': '2025-10-01 04:12:56.457307', 'step': 1444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.509042', 'step': 1444, 'epoch': 2} {'type': 'loss', 'content': 0.009464549832046032, 'timestamp': '2025-10-01 04:12:56.511671', 'step': 1445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.558203', 'step': 1445, 'epoch': 2} {'type': 'loss', 'content': 0.014040338806807995, 'timestamp': '2025-10-01 04:12:56.566269', 'step': 1446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.615819', 'step': 1446, 'epoch': 2} {'type': 'loss', 'content': 0.010641560889780521, 'timestamp': '2025-10-01 04:12:56.623893', 'step': 1447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.674826', 'step': 1447, 'epoch': 2} {'type': 'loss', 'content': 0.015267870388925076, 'timestamp': '2025-10-01 04:12:56.706388', 'step': 1448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.746499', 'step': 1448, 'epoch': 2} {'type': 'loss', 'content': 0.017825011163949966, 'timestamp': '2025-10-01 04:12:56.749752', 'step': 1449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.793734', 'step': 1449, 'epoch': 2} {'type': 'loss', 'content': 0.010337586514651775, 'timestamp': '2025-10-01 04:12:56.801961', 'step': 1450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:56.859522', 'step': 1450, 'epoch': 2} {'type': 'loss', 'content': 0.027803268283605576, 'timestamp': '2025-10-01 04:12:56.866101', 'step': 1451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:56.917191', 'step': 1451, 'epoch': 2} {'type': 'loss', 'content': 0.010486974380910397, 'timestamp': '2025-10-01 04:12:56.946933', 'step': 1452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:56.981331', 'step': 1452, 'epoch': 2} {'type': 'loss', 'content': 0.003227656939998269, 'timestamp': '2025-10-01 04:12:56.988088', 'step': 1453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.032161', 'step': 1453, 'epoch': 2} {'type': 'loss', 'content': 0.01615157537162304, 'timestamp': '2025-10-01 04:12:57.040206', 'step': 1454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.090577', 'step': 1454, 'epoch': 2} {'type': 'loss', 'content': 0.016013627871870995, 'timestamp': '2025-10-01 04:12:57.098203', 'step': 1455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.155382', 'step': 1455, 'epoch': 2} {'type': 'loss', 'content': 0.023992856964468956, 'timestamp': '2025-10-01 04:12:57.180844', 'step': 1456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.230465', 'step': 1456, 'epoch': 2} {'type': 'loss', 'content': 0.010106414556503296, 'timestamp': '2025-10-01 04:12:57.240270', 'step': 1457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.281572', 'step': 1457, 'epoch': 2} {'type': 'loss', 'content': 0.02062460407614708, 'timestamp': '2025-10-01 04:12:57.284766', 'step': 1458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.325578', 'step': 1458, 'epoch': 2} {'type': 'loss', 'content': 0.01863592490553856, 'timestamp': '2025-10-01 04:12:57.330741', 'step': 1459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.372236', 'step': 1459, 'epoch': 2} {'type': 'loss', 'content': 0.004499680362641811, 'timestamp': '2025-10-01 04:12:57.395954', 'step': 1460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.427187', 'step': 1460, 'epoch': 2} {'type': 'loss', 'content': 0.03403583914041519, 'timestamp': '2025-10-01 04:12:57.429926', 'step': 1461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.460759', 'step': 1461, 'epoch': 2} {'type': 'loss', 'content': 0.014832577668130398, 'timestamp': '2025-10-01 04:12:57.463370', 'step': 1462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.494456', 'step': 1462, 'epoch': 2} {'type': 'loss', 'content': 0.022800559177994728, 'timestamp': '2025-10-01 04:12:57.496941', 'step': 1463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.527549', 'step': 1463, 'epoch': 2} {'type': 'loss', 'content': 0.022497637197375298, 'timestamp': '2025-10-01 04:12:57.551562', 'step': 1464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:57.581736', 'step': 1464, 'epoch': 2} {'type': 'loss', 'content': 0.014008629135787487, 'timestamp': '2025-10-01 04:12:57.583809', 'step': 1465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.614061', 'step': 1465, 'epoch': 2} {'type': 'loss', 'content': 0.027394462376832962, 'timestamp': '2025-10-01 04:12:57.616241', 'step': 1466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.647161', 'step': 1466, 'epoch': 2} {'type': 'loss', 'content': 0.03310396149754524, 'timestamp': '2025-10-01 04:12:57.649464', 'step': 1467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.680808', 'step': 1467, 'epoch': 2} {'type': 'loss', 'content': 0.01916583441197872, 'timestamp': '2025-10-01 04:12:57.704447', 'step': 1468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.734862', 'step': 1468, 'epoch': 2} {'type': 'loss', 'content': 0.017947528511285782, 'timestamp': '2025-10-01 04:12:57.737181', 'step': 1469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.793127', 'step': 1469, 'epoch': 2} {'type': 'loss', 'content': 0.026038730517029762, 'timestamp': '2025-10-01 04:12:57.797050', 'step': 1470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.836630', 'step': 1470, 'epoch': 2} {'type': 'loss', 'content': 0.017815809696912766, 'timestamp': '2025-10-01 04:12:57.842061', 'step': 1471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:57.893379', 'step': 1471, 'epoch': 2} {'type': 'loss', 'content': 0.014733510091900826, 'timestamp': '2025-10-01 04:12:57.919086', 'step': 1472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:57.961717', 'step': 1472, 'epoch': 2} {'type': 'loss', 'content': 0.007148542441427708, 'timestamp': '2025-10-01 04:12:57.964450', 'step': 1473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:58.004350', 'step': 1473, 'epoch': 2} {'type': 'loss', 'content': 0.010934674181044102, 'timestamp': '2025-10-01 04:12:58.009758', 'step': 1474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:58.050044', 'step': 1474, 'epoch': 2} {'type': 'loss', 'content': 0.01579052396118641, 'timestamp': '2025-10-01 04:12:58.055107', 'step': 1475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:58.097716', 'step': 1475, 'epoch': 2} {'type': 'loss', 'content': 0.013724747113883495, 'timestamp': '2025-10-01 04:12:58.125086', 'step': 1476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:12:58.182801', 'step': 1476, 'epoch': 2} {'type': 'loss', 'content': 0.006694742478430271, 'timestamp': '2025-10-01 04:12:58.191370', 'step': 1477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:58.250453', 'step': 1477, 'epoch': 2} {'type': 'loss', 'content': 0.010686053894460201, 'timestamp': '2025-10-01 04:12:58.258702', 'step': 1478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:58.309810', 'step': 1478, 'epoch': 2} {'type': 'loss', 'content': 0.01659354753792286, 'timestamp': '2025-10-01 04:12:58.316296', 'step': 1479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:58.359184', 'step': 1479, 'epoch': 2} {'type': 'loss', 'content': 0.013195483945310116, 'timestamp': '2025-10-01 04:12:58.386773', 'step': 1480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:12:58.428827', 'step': 1480, 'epoch': 2} {'type': 'loss', 'content': 0.01653684861958027, 'timestamp': '2025-10-01 04:12:58.436691', 'step': 1481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:12:58.481832', 'step': 1481, 'epoch': 2} {'type': 'loss', 'content': 0.01580851338803768, 'timestamp': '2025-10-01 04:12:58.484372', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:12:59.963200', 'step': 1482, 'epoch': 2} {'type': 'pplx', 'content': 84949291.70526439, 'timestamp': '2025-10-01 04:12:59.974744', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.018300', 'step': 1482, 'epoch': 2} {'type': 'loss', 'content': 0.017466850578784943, 'timestamp': '2025-10-01 04:13:00.021522', 'step': 1483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.073224', 'step': 1483, 'epoch': 2} {'type': 'loss', 'content': 0.010928018018603325, 'timestamp': '2025-10-01 04:13:00.104591', 'step': 1484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:00.159071', 'step': 1484, 'epoch': 2} {'type': 'loss', 'content': 0.026852060109376907, 'timestamp': '2025-10-01 04:13:00.162297', 'step': 1485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.197149', 'step': 1485, 'epoch': 2} {'type': 'loss', 'content': 0.016156313940882683, 'timestamp': '2025-10-01 04:13:00.204083', 'step': 1486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.257938', 'step': 1486, 'epoch': 2} {'type': 'loss', 'content': 0.004799918737262487, 'timestamp': '2025-10-01 04:13:00.260677', 'step': 1487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.314212', 'step': 1487, 'epoch': 2} {'type': 'loss', 'content': 0.007315644528716803, 'timestamp': '2025-10-01 04:13:00.344293', 'step': 1488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.392897', 'step': 1488, 'epoch': 2} {'type': 'loss', 'content': 0.012450148351490498, 'timestamp': '2025-10-01 04:13:00.404146', 'step': 1489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:00.442020', 'step': 1489, 'epoch': 2} {'type': 'loss', 'content': 0.025318952277302742, 'timestamp': '2025-10-01 04:13:00.444497', 'step': 1490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.497482', 'step': 1490, 'epoch': 2} {'type': 'loss', 'content': 0.014360993169248104, 'timestamp': '2025-10-01 04:13:00.507317', 'step': 1491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.563849', 'step': 1491, 'epoch': 2} {'type': 'loss', 'content': 0.008939526043832302, 'timestamp': '2025-10-01 04:13:00.594529', 'step': 1492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.640984', 'step': 1492, 'epoch': 2} {'type': 'loss', 'content': 0.004608576186001301, 'timestamp': '2025-10-01 04:13:00.644839', 'step': 1493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:00.698159', 'step': 1493, 'epoch': 2} {'type': 'loss', 'content': 0.016055211424827576, 'timestamp': '2025-10-01 04:13:00.701191', 'step': 1494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:00.744780', 'step': 1494, 'epoch': 2} {'type': 'loss', 'content': 0.0358593687415123, 'timestamp': '2025-10-01 04:13:00.753016', 'step': 1495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.810832', 'step': 1495, 'epoch': 2} {'type': 'loss', 'content': 0.016153011471033096, 'timestamp': '2025-10-01 04:13:00.836891', 'step': 1496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.878392', 'step': 1496, 'epoch': 2} {'type': 'loss', 'content': 0.009653310291469097, 'timestamp': '2025-10-01 04:13:00.882264', 'step': 1497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:00.917780', 'step': 1497, 'epoch': 2} {'type': 'loss', 'content': 0.00823700986802578, 'timestamp': '2025-10-01 04:13:00.920772', 'step': 1498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:00.969110', 'step': 1498, 'epoch': 2} {'type': 'loss', 'content': 0.020457221195101738, 'timestamp': '2025-10-01 04:13:00.977680', 'step': 1499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:01.029140', 'step': 1499, 'epoch': 2} {'type': 'loss', 'content': 0.015353398397564888, 'timestamp': '2025-10-01 04:13:01.061924', 'step': 1500, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-10-01 04:13:06.393499', 'step': 1500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:06.442676', 'step': 1500, 'epoch': 2} {'type': 'loss', 'content': 0.009914493188261986, 'timestamp': '2025-10-01 04:13:06.449061', 'step': 1501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:06.497957', 'step': 1501, 'epoch': 2} {'type': 'loss', 'content': 0.006775428541004658, 'timestamp': '2025-10-01 04:13:06.501114', 'step': 1502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:06.548913', 'step': 1502, 'epoch': 2} {'type': 'loss', 'content': 0.007616374175995588, 'timestamp': '2025-10-01 04:13:06.554663', 'step': 1503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:06.603316', 'step': 1503, 'epoch': 2} {'type': 'loss', 'content': 0.010884781368076801, 'timestamp': '2025-10-01 04:13:06.632490', 'step': 1504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:06.667253', 'step': 1504, 'epoch': 2} {'type': 'loss', 'content': 0.01737024076282978, 'timestamp': '2025-10-01 04:13:06.676654', 'step': 1505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:06.730152', 'step': 1505, 'epoch': 2} {'type': 'loss', 'content': 0.029502153396606445, 'timestamp': '2025-10-01 04:13:06.738106', 'step': 1506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:06.777838', 'step': 1506, 'epoch': 2} {'type': 'loss', 'content': 0.03163299709558487, 'timestamp': '2025-10-01 04:13:06.786567', 'step': 1507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:13:06.835708', 'step': 1507, 'epoch': 2} {'type': 'loss', 'content': 0.012678087688982487, 'timestamp': '2025-10-01 04:13:06.865222', 'step': 1508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:06.911776', 'step': 1508, 'epoch': 2} {'type': 'loss', 'content': 0.0038244540337473154, 'timestamp': '2025-10-01 04:13:06.915334', 'step': 1509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:06.967546', 'step': 1509, 'epoch': 2} {'type': 'loss', 'content': 0.021646136417984962, 'timestamp': '2025-10-01 04:13:06.975516', 'step': 1510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:07.021367', 'step': 1510, 'epoch': 2} {'type': 'loss', 'content': 0.0009849267080426216, 'timestamp': '2025-10-01 04:13:07.030207', 'step': 1511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.079350', 'step': 1511, 'epoch': 2} {'type': 'loss', 'content': 0.009463687427341938, 'timestamp': '2025-10-01 04:13:07.107820', 'step': 1512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.169104', 'step': 1512, 'epoch': 2} {'type': 'loss', 'content': 0.01159172784537077, 'timestamp': '2025-10-01 04:13:07.178023', 'step': 1513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:07.225267', 'step': 1513, 'epoch': 2} {'type': 'loss', 'content': 0.02389632724225521, 'timestamp': '2025-10-01 04:13:07.234903', 'step': 1514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.278785', 'step': 1514, 'epoch': 2} {'type': 'loss', 'content': 0.006701233331114054, 'timestamp': '2025-10-01 04:13:07.285697', 'step': 1515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:07.338661', 'step': 1515, 'epoch': 2} {'type': 'loss', 'content': 0.06936667859554291, 'timestamp': '2025-10-01 04:13:07.369101', 'step': 1516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.420893', 'step': 1516, 'epoch': 2} {'type': 'loss', 'content': 0.009653878398239613, 'timestamp': '2025-10-01 04:13:07.426162', 'step': 1517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.475992', 'step': 1517, 'epoch': 2} {'type': 'loss', 'content': 0.026117833331227303, 'timestamp': '2025-10-01 04:13:07.487396', 'step': 1518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.536917', 'step': 1518, 'epoch': 2} {'type': 'loss', 'content': 0.018482133746147156, 'timestamp': '2025-10-01 04:13:07.541204', 'step': 1519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.603762', 'step': 1519, 'epoch': 2} {'type': 'loss', 'content': 0.002146028447896242, 'timestamp': '2025-10-01 04:13:07.634959', 'step': 1520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.685281', 'step': 1520, 'epoch': 2} {'type': 'loss', 'content': 0.01685287430882454, 'timestamp': '2025-10-01 04:13:07.689656', 'step': 1521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.738103', 'step': 1521, 'epoch': 2} {'type': 'loss', 'content': 0.006739395670592785, 'timestamp': '2025-10-01 04:13:07.745658', 'step': 1522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:07.793751', 'step': 1522, 'epoch': 2} {'type': 'loss', 'content': 0.01353517360985279, 'timestamp': '2025-10-01 04:13:07.801902', 'step': 1523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.854972', 'step': 1523, 'epoch': 2} {'type': 'loss', 'content': 0.012433887459337711, 'timestamp': '2025-10-01 04:13:07.885214', 'step': 1524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:07.928123', 'step': 1524, 'epoch': 2} {'type': 'loss', 'content': 0.008951112627983093, 'timestamp': '2025-10-01 04:13:07.931001', 'step': 1525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:07.978806', 'step': 1525, 'epoch': 2} {'type': 'loss', 'content': 0.011042612604796886, 'timestamp': '2025-10-01 04:13:07.988286', 'step': 1526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:08.031557', 'step': 1526, 'epoch': 2} {'type': 'loss', 'content': 0.010584224946796894, 'timestamp': '2025-10-01 04:13:08.035270', 'step': 1527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.087622', 'step': 1527, 'epoch': 2} {'type': 'loss', 'content': 0.03606724739074707, 'timestamp': '2025-10-01 04:13:08.117541', 'step': 1528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.165808', 'step': 1528, 'epoch': 2} {'type': 'loss', 'content': 0.036213018000125885, 'timestamp': '2025-10-01 04:13:08.172208', 'step': 1529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.220980', 'step': 1529, 'epoch': 2} {'type': 'loss', 'content': 0.0382445752620697, 'timestamp': '2025-10-01 04:13:08.228964', 'step': 1530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.274295', 'step': 1530, 'epoch': 2} {'type': 'loss', 'content': 0.006602444685995579, 'timestamp': '2025-10-01 04:13:08.283038', 'step': 1531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.331547', 'step': 1531, 'epoch': 2} {'type': 'loss', 'content': 0.020674001425504684, 'timestamp': '2025-10-01 04:13:08.356355', 'step': 1532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.396906', 'step': 1532, 'epoch': 2} {'type': 'loss', 'content': 0.0238628126680851, 'timestamp': '2025-10-01 04:13:08.399804', 'step': 1533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:08.433448', 'step': 1533, 'epoch': 2} {'type': 'loss', 'content': 0.04866022616624832, 'timestamp': '2025-10-01 04:13:08.441887', 'step': 1534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.489206', 'step': 1534, 'epoch': 2} {'type': 'loss', 'content': 0.035724934190511703, 'timestamp': '2025-10-01 04:13:08.503351', 'step': 1535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.571101', 'step': 1535, 'epoch': 2} {'type': 'loss', 'content': 0.026798827573657036, 'timestamp': '2025-10-01 04:13:08.605571', 'step': 1536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:08.659294', 'step': 1536, 'epoch': 2} {'type': 'loss', 'content': 0.00810581911355257, 'timestamp': '2025-10-01 04:13:08.668819', 'step': 1537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:08.724892', 'step': 1537, 'epoch': 2} {'type': 'loss', 'content': 0.03755718097090721, 'timestamp': '2025-10-01 04:13:08.728316', 'step': 1538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:08.777270', 'step': 1538, 'epoch': 2} {'type': 'loss', 'content': 0.004284982569515705, 'timestamp': '2025-10-01 04:13:08.781917', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:10.471606', 'step': 1539, 'epoch': 2} {'type': 'pplx', 'content': 77655848.63616179, 'timestamp': '2025-10-01 04:13:10.475558', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.508860', 'step': 1539, 'epoch': 2} {'type': 'loss', 'content': 0.016568129882216454, 'timestamp': '2025-10-01 04:13:10.533188', 'step': 1540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.576631', 'step': 1540, 'epoch': 2} {'type': 'loss', 'content': 0.038749318569898605, 'timestamp': '2025-10-01 04:13:10.582652', 'step': 1541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.626905', 'step': 1541, 'epoch': 2} {'type': 'loss', 'content': 0.01315435953438282, 'timestamp': '2025-10-01 04:13:10.632906', 'step': 1542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.684731', 'step': 1542, 'epoch': 2} {'type': 'loss', 'content': 0.0027639740146696568, 'timestamp': '2025-10-01 04:13:10.693105', 'step': 1543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.731609', 'step': 1543, 'epoch': 2} {'type': 'loss', 'content': 0.003936560358852148, 'timestamp': '2025-10-01 04:13:10.762627', 'step': 1544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.813420', 'step': 1544, 'epoch': 2} {'type': 'loss', 'content': 0.005857733078300953, 'timestamp': '2025-10-01 04:13:10.821369', 'step': 1545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.872193', 'step': 1545, 'epoch': 2} {'type': 'loss', 'content': 0.04366546869277954, 'timestamp': '2025-10-01 04:13:10.881688', 'step': 1546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.941190', 'step': 1546, 'epoch': 2} {'type': 'loss', 'content': 0.040080875158309937, 'timestamp': '2025-10-01 04:13:10.952569', 'step': 1547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:10.999536', 'step': 1547, 'epoch': 2} {'type': 'loss', 'content': 0.01445924025028944, 'timestamp': '2025-10-01 04:13:11.031193', 'step': 1548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:11.082481', 'step': 1548, 'epoch': 2} {'type': 'loss', 'content': 0.005054256413131952, 'timestamp': '2025-10-01 04:13:11.086663', 'step': 1549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.131931', 'step': 1549, 'epoch': 2} {'type': 'loss', 'content': 0.0016825426137074828, 'timestamp': '2025-10-01 04:13:11.142622', 'step': 1550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.193041', 'step': 1550, 'epoch': 2} {'type': 'loss', 'content': 0.01055573858320713, 'timestamp': '2025-10-01 04:13:11.200268', 'step': 1551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:11.253270', 'step': 1551, 'epoch': 2} {'type': 'loss', 'content': 0.026796722784638405, 'timestamp': '2025-10-01 04:13:11.280372', 'step': 1552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.322620', 'step': 1552, 'epoch': 2} {'type': 'loss', 'content': 0.005125401075929403, 'timestamp': '2025-10-01 04:13:11.327414', 'step': 1553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:11.370658', 'step': 1553, 'epoch': 2} {'type': 'loss', 'content': 0.001129808253608644, 'timestamp': '2025-10-01 04:13:11.376362', 'step': 1554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.427167', 'step': 1554, 'epoch': 2} {'type': 'loss', 'content': 0.05054508522152901, 'timestamp': '2025-10-01 04:13:11.432103', 'step': 1555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.474371', 'step': 1555, 'epoch': 2} {'type': 'loss', 'content': 0.04554403945803642, 'timestamp': '2025-10-01 04:13:11.501745', 'step': 1556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.540115', 'step': 1556, 'epoch': 2} {'type': 'loss', 'content': 0.0013658460229635239, 'timestamp': '2025-10-01 04:13:11.544415', 'step': 1557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.586716', 'step': 1557, 'epoch': 2} {'type': 'loss', 'content': 0.004094754345715046, 'timestamp': '2025-10-01 04:13:11.594196', 'step': 1558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:11.645190', 'step': 1558, 'epoch': 2} {'type': 'loss', 'content': 0.012671545147895813, 'timestamp': '2025-10-01 04:13:11.647484', 'step': 1559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.685690', 'step': 1559, 'epoch': 2} {'type': 'loss', 'content': 0.0024722558446228504, 'timestamp': '2025-10-01 04:13:11.715832', 'step': 1560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:11.765199', 'step': 1560, 'epoch': 2} {'type': 'loss', 'content': 0.040812183171510696, 'timestamp': '2025-10-01 04:13:11.773090', 'step': 1561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.814801', 'step': 1561, 'epoch': 2} {'type': 'loss', 'content': 0.004918430000543594, 'timestamp': '2025-10-01 04:13:11.822778', 'step': 1562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:11.873333', 'step': 1562, 'epoch': 2} {'type': 'loss', 'content': 0.0334562249481678, 'timestamp': '2025-10-01 04:13:11.879939', 'step': 1563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:11.932231', 'step': 1563, 'epoch': 2} {'type': 'loss', 'content': 0.0065500554628670216, 'timestamp': '2025-10-01 04:13:11.957013', 'step': 1564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:12.001221', 'step': 1564, 'epoch': 2} {'type': 'loss', 'content': 0.005265658255666494, 'timestamp': '2025-10-01 04:13:12.008699', 'step': 1565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:12.046129', 'step': 1565, 'epoch': 2} {'type': 'loss', 'content': 0.005237952806055546, 'timestamp': '2025-10-01 04:13:12.049451', 'step': 1566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:12.094927', 'step': 1566, 'epoch': 2} {'type': 'loss', 'content': 0.003421790199354291, 'timestamp': '2025-10-01 04:13:12.103295', 'step': 1567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.144690', 'step': 1567, 'epoch': 2} {'type': 'loss', 'content': 0.02141939476132393, 'timestamp': '2025-10-01 04:13:12.172141', 'step': 1568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.228523', 'step': 1568, 'epoch': 2} {'type': 'loss', 'content': 0.008104095235466957, 'timestamp': '2025-10-01 04:13:12.235493', 'step': 1569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.280761', 'step': 1569, 'epoch': 2} {'type': 'loss', 'content': 0.01682603359222412, 'timestamp': '2025-10-01 04:13:12.287455', 'step': 1570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.329120', 'step': 1570, 'epoch': 2} {'type': 'loss', 'content': 0.02583424746990204, 'timestamp': '2025-10-01 04:13:12.333111', 'step': 1571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:12.382025', 'step': 1571, 'epoch': 2} {'type': 'loss', 'content': 0.011157028377056122, 'timestamp': '2025-10-01 04:13:12.410105', 'step': 1572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.448660', 'step': 1572, 'epoch': 2} {'type': 'loss', 'content': 0.00611527357250452, 'timestamp': '2025-10-01 04:13:12.453210', 'step': 1573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.497994', 'step': 1573, 'epoch': 2} {'type': 'loss', 'content': 0.0019314091186970472, 'timestamp': '2025-10-01 04:13:12.503769', 'step': 1574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:12.550452', 'step': 1574, 'epoch': 2} {'type': 'loss', 'content': 0.021295342594385147, 'timestamp': '2025-10-01 04:13:12.561638', 'step': 1575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:12.608409', 'step': 1575, 'epoch': 2} {'type': 'loss', 'content': 0.005156986881047487, 'timestamp': '2025-10-01 04:13:12.638922', 'step': 1576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.681337', 'step': 1576, 'epoch': 2} {'type': 'loss', 'content': 0.011544455774128437, 'timestamp': '2025-10-01 04:13:12.688360', 'step': 1577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.719960', 'step': 1577, 'epoch': 2} {'type': 'loss', 'content': 0.016041046008467674, 'timestamp': '2025-10-01 04:13:12.722928', 'step': 1578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.767531', 'step': 1578, 'epoch': 2} {'type': 'loss', 'content': 0.004609879106283188, 'timestamp': '2025-10-01 04:13:12.771288', 'step': 1579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.810722', 'step': 1579, 'epoch': 2} {'type': 'loss', 'content': 0.01365666277706623, 'timestamp': '2025-10-01 04:13:12.838008', 'step': 1580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:12.877061', 'step': 1580, 'epoch': 2} {'type': 'loss', 'content': 0.01647232100367546, 'timestamp': '2025-10-01 04:13:12.880290', 'step': 1581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.927247', 'step': 1581, 'epoch': 2} {'type': 'loss', 'content': 0.007607575505971909, 'timestamp': '2025-10-01 04:13:12.933615', 'step': 1582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:12.979921', 'step': 1582, 'epoch': 2} {'type': 'loss', 'content': 0.006439111661165953, 'timestamp': '2025-10-01 04:13:12.986838', 'step': 1583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.026906', 'step': 1583, 'epoch': 2} {'type': 'loss', 'content': 0.011291056871414185, 'timestamp': '2025-10-01 04:13:13.051543', 'step': 1584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.093150', 'step': 1584, 'epoch': 2} {'type': 'loss', 'content': 0.010510903783142567, 'timestamp': '2025-10-01 04:13:13.099338', 'step': 1585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.143499', 'step': 1585, 'epoch': 2} {'type': 'loss', 'content': 0.028885314241051674, 'timestamp': '2025-10-01 04:13:13.147899', 'step': 1586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:13.188834', 'step': 1586, 'epoch': 2} {'type': 'loss', 'content': 0.03779543563723564, 'timestamp': '2025-10-01 04:13:13.195054', 'step': 1587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.242634', 'step': 1587, 'epoch': 2} {'type': 'loss', 'content': 0.006501057185232639, 'timestamp': '2025-10-01 04:13:13.276221', 'step': 1588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.313655', 'step': 1588, 'epoch': 2} {'type': 'loss', 'content': 0.017126483842730522, 'timestamp': '2025-10-01 04:13:13.320308', 'step': 1589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.361042', 'step': 1589, 'epoch': 2} {'type': 'loss', 'content': 0.01926274411380291, 'timestamp': '2025-10-01 04:13:13.367872', 'step': 1590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.406104', 'step': 1590, 'epoch': 2} {'type': 'loss', 'content': 0.026384757831692696, 'timestamp': '2025-10-01 04:13:13.412951', 'step': 1591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.457903', 'step': 1591, 'epoch': 2} {'type': 'loss', 'content': 0.037959903478622437, 'timestamp': '2025-10-01 04:13:13.482434', 'step': 1592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.520971', 'step': 1592, 'epoch': 2} {'type': 'loss', 'content': 0.008754936046898365, 'timestamp': '2025-10-01 04:13:13.526987', 'step': 1593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.575983', 'step': 1593, 'epoch': 2} {'type': 'loss', 'content': 0.01358555257320404, 'timestamp': '2025-10-01 04:13:13.582625', 'step': 1594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:13.628764', 'step': 1594, 'epoch': 2} {'type': 'loss', 'content': 0.011499091051518917, 'timestamp': '2025-10-01 04:13:13.634773', 'step': 1595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:13.681097', 'step': 1595, 'epoch': 2} {'type': 'loss', 'content': 0.014255395159125328, 'timestamp': '2025-10-01 04:13:13.704974', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:14.463635', 'step': 1596, 'epoch': 2} {'type': 'pplx', 'content': 52347787.02300448, 'timestamp': '2025-10-01 04:13:14.467134', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:14.497190', 'step': 1596, 'epoch': 2} {'type': 'loss', 'content': 0.008003183640539646, 'timestamp': '2025-10-01 04:13:14.499343', 'step': 1597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:14.530125', 'step': 1597, 'epoch': 2} {'type': 'loss', 'content': 0.009980769827961922, 'timestamp': '2025-10-01 04:13:14.532732', 'step': 1598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:14.563640', 'step': 1598, 'epoch': 2} {'type': 'loss', 'content': 0.017090851441025734, 'timestamp': '2025-10-01 04:13:14.565998', 'step': 1599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:14.597935', 'step': 1599, 'epoch': 2} {'type': 'loss', 'content': 0.011963611468672752, 'timestamp': '2025-10-01 04:13:14.621268', 'step': 1600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:14.654546', 'step': 1600, 'epoch': 2} {'type': 'loss', 'content': 0.006779204122722149, 'timestamp': '2025-10-01 04:13:14.656764', 'step': 1601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:14.690170', 'step': 1601, 'epoch': 2} {'type': 'loss', 'content': 0.025302747264504433, 'timestamp': '2025-10-01 04:13:14.692544', 'step': 1602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:14.726423', 'step': 1602, 'epoch': 2} {'type': 'loss', 'content': 0.017179856076836586, 'timestamp': '2025-10-01 04:13:14.728666', 'step': 1603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:14.762226', 'step': 1603, 'epoch': 2} {'type': 'loss', 'content': 0.005239727441221476, 'timestamp': '2025-10-01 04:13:14.786399', 'step': 1604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:14.818482', 'step': 1604, 'epoch': 2} {'type': 'loss', 'content': 0.00681100320070982, 'timestamp': '2025-10-01 04:13:14.820686', 'step': 1605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:14.851705', 'step': 1605, 'epoch': 2} {'type': 'loss', 'content': 0.031561676412820816, 'timestamp': '2025-10-01 04:13:14.854625', 'step': 1606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:14.885921', 'step': 1606, 'epoch': 2} {'type': 'loss', 'content': 0.008977233432233334, 'timestamp': '2025-10-01 04:13:14.888366', 'step': 1607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:14.919498', 'step': 1607, 'epoch': 2} {'type': 'loss', 'content': 0.010205530561506748, 'timestamp': '2025-10-01 04:13:14.943001', 'step': 1608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:14.973817', 'step': 1608, 'epoch': 2} {'type': 'loss', 'content': 0.007750756572932005, 'timestamp': '2025-10-01 04:13:14.976302', 'step': 1609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.008690', 'step': 1609, 'epoch': 2} {'type': 'loss', 'content': 0.027499424293637276, 'timestamp': '2025-10-01 04:13:15.010962', 'step': 1610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.041843', 'step': 1610, 'epoch': 2} {'type': 'loss', 'content': 0.04024529829621315, 'timestamp': '2025-10-01 04:13:15.044051', 'step': 1611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.074969', 'step': 1611, 'epoch': 2} {'type': 'loss', 'content': 0.006539492402225733, 'timestamp': '2025-10-01 04:13:15.098707', 'step': 1612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:15.131453', 'step': 1612, 'epoch': 2} {'type': 'loss', 'content': 0.01586497202515602, 'timestamp': '2025-10-01 04:13:15.134249', 'step': 1613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.166162', 'step': 1613, 'epoch': 2} {'type': 'loss', 'content': 0.007332774344831705, 'timestamp': '2025-10-01 04:13:15.168579', 'step': 1614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.200232', 'step': 1614, 'epoch': 2} {'type': 'loss', 'content': 0.0067348359152674675, 'timestamp': '2025-10-01 04:13:15.202679', 'step': 1615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.233556', 'step': 1615, 'epoch': 2} {'type': 'loss', 'content': 0.020247790962457657, 'timestamp': '2025-10-01 04:13:15.257629', 'step': 1616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.288443', 'step': 1616, 'epoch': 2} {'type': 'loss', 'content': 0.020015602931380272, 'timestamp': '2025-10-01 04:13:15.290865', 'step': 1617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:15.321155', 'step': 1617, 'epoch': 2} {'type': 'loss', 'content': 0.007512770593166351, 'timestamp': '2025-10-01 04:13:15.323394', 'step': 1618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.353553', 'step': 1618, 'epoch': 2} {'type': 'loss', 'content': 0.013618254102766514, 'timestamp': '2025-10-01 04:13:15.355771', 'step': 1619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.386919', 'step': 1619, 'epoch': 2} {'type': 'loss', 'content': 0.013072640635073185, 'timestamp': '2025-10-01 04:13:15.410631', 'step': 1620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.441777', 'step': 1620, 'epoch': 2} {'type': 'loss', 'content': 0.00882615428417921, 'timestamp': '2025-10-01 04:13:15.445282', 'step': 1621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:15.476808', 'step': 1621, 'epoch': 2} {'type': 'loss', 'content': 0.010403069667518139, 'timestamp': '2025-10-01 04:13:15.479390', 'step': 1622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:15.513491', 'step': 1622, 'epoch': 2} {'type': 'loss', 'content': 0.0030257641337811947, 'timestamp': '2025-10-01 04:13:15.515779', 'step': 1623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.546806', 'step': 1623, 'epoch': 2} {'type': 'loss', 'content': 0.004086515866219997, 'timestamp': '2025-10-01 04:13:15.570626', 'step': 1624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.600868', 'step': 1624, 'epoch': 2} {'type': 'loss', 'content': 0.010360230691730976, 'timestamp': '2025-10-01 04:13:15.603087', 'step': 1625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.633799', 'step': 1625, 'epoch': 2} {'type': 'loss', 'content': 0.006260990630835295, 'timestamp': '2025-10-01 04:13:15.635965', 'step': 1626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.666052', 'step': 1626, 'epoch': 2} {'type': 'loss', 'content': 0.047125279903411865, 'timestamp': '2025-10-01 04:13:15.668362', 'step': 1627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:15.700288', 'step': 1627, 'epoch': 2} {'type': 'loss', 'content': 0.008346055634319782, 'timestamp': '2025-10-01 04:13:15.724522', 'step': 1628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.757788', 'step': 1628, 'epoch': 2} {'type': 'loss', 'content': 0.017164109274744987, 'timestamp': '2025-10-01 04:13:15.761373', 'step': 1629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.794440', 'step': 1629, 'epoch': 2} {'type': 'loss', 'content': 0.008173419162631035, 'timestamp': '2025-10-01 04:13:15.797100', 'step': 1630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.827211', 'step': 1630, 'epoch': 2} {'type': 'loss', 'content': 0.008467881008982658, 'timestamp': '2025-10-01 04:13:15.829571', 'step': 1631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:15.859988', 'step': 1631, 'epoch': 2} {'type': 'loss', 'content': 0.03314581885933876, 'timestamp': '2025-10-01 04:13:15.883874', 'step': 1632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.914163', 'step': 1632, 'epoch': 2} {'type': 'loss', 'content': 0.022746745496988297, 'timestamp': '2025-10-01 04:13:15.916275', 'step': 1633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.946825', 'step': 1633, 'epoch': 2} {'type': 'loss', 'content': 0.0051367864944040775, 'timestamp': '2025-10-01 04:13:15.948975', 'step': 1634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:15.979537', 'step': 1634, 'epoch': 2} {'type': 'loss', 'content': 0.009908909909427166, 'timestamp': '2025-10-01 04:13:15.981783', 'step': 1635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.013665', 'step': 1635, 'epoch': 2} {'type': 'loss', 'content': 0.006155069451779127, 'timestamp': '2025-10-01 04:13:16.037572', 'step': 1636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:16.068168', 'step': 1636, 'epoch': 2} {'type': 'loss', 'content': 0.0286524947732687, 'timestamp': '2025-10-01 04:13:16.070394', 'step': 1637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.101977', 'step': 1637, 'epoch': 2} {'type': 'loss', 'content': 0.0015293165342882276, 'timestamp': '2025-10-01 04:13:16.119786', 'step': 1638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.152532', 'step': 1638, 'epoch': 2} {'type': 'loss', 'content': 0.0018288606079295278, 'timestamp': '2025-10-01 04:13:16.160485', 'step': 1639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.196914', 'step': 1639, 'epoch': 2} {'type': 'loss', 'content': 0.0069273049011826515, 'timestamp': '2025-10-01 04:13:16.232303', 'step': 1640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.263740', 'step': 1640, 'epoch': 2} {'type': 'loss', 'content': 0.02906244620680809, 'timestamp': '2025-10-01 04:13:16.266019', 'step': 1641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.300037', 'step': 1641, 'epoch': 2} {'type': 'loss', 'content': 0.03223400190472603, 'timestamp': '2025-10-01 04:13:16.317374', 'step': 1642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:16.364037', 'step': 1642, 'epoch': 2} {'type': 'loss', 'content': 0.01042555458843708, 'timestamp': '2025-10-01 04:13:16.366762', 'step': 1643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:16.413526', 'step': 1643, 'epoch': 2} {'type': 'loss', 'content': 0.011268666945397854, 'timestamp': '2025-10-01 04:13:16.438019', 'step': 1644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.476833', 'step': 1644, 'epoch': 2} {'type': 'loss', 'content': 0.004011114593595266, 'timestamp': '2025-10-01 04:13:16.480232', 'step': 1645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.524681', 'step': 1645, 'epoch': 2} {'type': 'loss', 'content': 0.007196138612926006, 'timestamp': '2025-10-01 04:13:16.533262', 'step': 1646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.570305', 'step': 1646, 'epoch': 2} {'type': 'loss', 'content': 0.014080160297453403, 'timestamp': '2025-10-01 04:13:16.573389', 'step': 1647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.609290', 'step': 1647, 'epoch': 2} {'type': 'loss', 'content': 0.0066165500320494175, 'timestamp': '2025-10-01 04:13:16.637774', 'step': 1648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.670959', 'step': 1648, 'epoch': 2} {'type': 'loss', 'content': 0.023716462776064873, 'timestamp': '2025-10-01 04:13:16.675706', 'step': 1649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:16.714798', 'step': 1649, 'epoch': 2} {'type': 'loss', 'content': 0.00808466225862503, 'timestamp': '2025-10-01 04:13:16.717082', 'step': 1650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:16.755241', 'step': 1650, 'epoch': 2} {'type': 'loss', 'content': 0.0066062286496162415, 'timestamp': '2025-10-01 04:13:16.757816', 'step': 1651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:16.815548', 'step': 1651, 'epoch': 2} {'type': 'loss', 'content': 0.005531540606170893, 'timestamp': '2025-10-01 04:13:16.842589', 'step': 1652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:16.886964', 'step': 1652, 'epoch': 2} {'type': 'loss', 'content': 0.012811918742954731, 'timestamp': '2025-10-01 04:13:16.889840', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:17.749843', 'step': 1653, 'epoch': 2} {'type': 'pplx', 'content': 56661103.4377224, 'timestamp': '2025-10-01 04:13:17.752192', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:17.782199', 'step': 1653, 'epoch': 2} {'type': 'loss', 'content': 0.018863413482904434, 'timestamp': '2025-10-01 04:13:17.784596', 'step': 1654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:17.818028', 'step': 1654, 'epoch': 2} {'type': 'loss', 'content': 0.007815408520400524, 'timestamp': '2025-10-01 04:13:17.820306', 'step': 1655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:17.850419', 'step': 1655, 'epoch': 2} {'type': 'loss', 'content': 0.0063620880246162415, 'timestamp': '2025-10-01 04:13:17.874417', 'step': 1656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:17.905300', 'step': 1656, 'epoch': 2} {'type': 'loss', 'content': 0.020800398662686348, 'timestamp': '2025-10-01 04:13:17.907589', 'step': 1657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:17.938776', 'step': 1657, 'epoch': 2} {'type': 'loss', 'content': 0.02517288736999035, 'timestamp': '2025-10-01 04:13:17.940894', 'step': 1658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:17.971646', 'step': 1658, 'epoch': 2} {'type': 'loss', 'content': 0.010495798662304878, 'timestamp': '2025-10-01 04:13:17.974010', 'step': 1659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.004415', 'step': 1659, 'epoch': 2} {'type': 'loss', 'content': 0.0034333032090216875, 'timestamp': '2025-10-01 04:13:18.028138', 'step': 1660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:18.058615', 'step': 1660, 'epoch': 2} {'type': 'loss', 'content': 0.04161619767546654, 'timestamp': '2025-10-01 04:13:18.061050', 'step': 1661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:18.092475', 'step': 1661, 'epoch': 2} {'type': 'loss', 'content': 0.01029189396649599, 'timestamp': '2025-10-01 04:13:18.094882', 'step': 1662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.124991', 'step': 1662, 'epoch': 2} {'type': 'loss', 'content': 0.004730475600808859, 'timestamp': '2025-10-01 04:13:18.127165', 'step': 1663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:18.158450', 'step': 1663, 'epoch': 2} {'type': 'loss', 'content': 0.005704338662326336, 'timestamp': '2025-10-01 04:13:18.182232', 'step': 1664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.216043', 'step': 1664, 'epoch': 2} {'type': 'loss', 'content': 0.013141750358045101, 'timestamp': '2025-10-01 04:13:18.218090', 'step': 1665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.249036', 'step': 1665, 'epoch': 2} {'type': 'loss', 'content': 0.015302762389183044, 'timestamp': '2025-10-01 04:13:18.251200', 'step': 1666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:18.282261', 'step': 1666, 'epoch': 2} {'type': 'loss', 'content': 0.016241682693362236, 'timestamp': '2025-10-01 04:13:18.284567', 'step': 1667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:18.316231', 'step': 1667, 'epoch': 2} {'type': 'loss', 'content': 0.0158796738833189, 'timestamp': '2025-10-01 04:13:18.341517', 'step': 1668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.372234', 'step': 1668, 'epoch': 2} {'type': 'loss', 'content': 0.03702926263213158, 'timestamp': '2025-10-01 04:13:18.374471', 'step': 1669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.404827', 'step': 1669, 'epoch': 2} {'type': 'loss', 'content': 0.011859017424285412, 'timestamp': '2025-10-01 04:13:18.406941', 'step': 1670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:18.440403', 'step': 1670, 'epoch': 2} {'type': 'loss', 'content': 0.012553932145237923, 'timestamp': '2025-10-01 04:13:18.442586', 'step': 1671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:18.472924', 'step': 1671, 'epoch': 2} {'type': 'loss', 'content': 0.01608966663479805, 'timestamp': '2025-10-01 04:13:18.496912', 'step': 1672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:18.531887', 'step': 1672, 'epoch': 2} {'type': 'loss', 'content': 0.010410645976662636, 'timestamp': '2025-10-01 04:13:18.534089', 'step': 1673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.565628', 'step': 1673, 'epoch': 2} {'type': 'loss', 'content': 0.003722522873431444, 'timestamp': '2025-10-01 04:13:18.568039', 'step': 1674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:18.599519', 'step': 1674, 'epoch': 2} {'type': 'loss', 'content': 0.0032592397183179855, 'timestamp': '2025-10-01 04:13:18.601890', 'step': 1675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:18.634145', 'step': 1675, 'epoch': 2} {'type': 'loss', 'content': 0.002561945701017976, 'timestamp': '2025-10-01 04:13:18.659400', 'step': 1676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.694416', 'step': 1676, 'epoch': 2} {'type': 'loss', 'content': 0.008923517540097237, 'timestamp': '2025-10-01 04:13:18.696491', 'step': 1677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.729157', 'step': 1677, 'epoch': 2} {'type': 'loss', 'content': 0.005183252971619368, 'timestamp': '2025-10-01 04:13:18.732975', 'step': 1678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:18.770986', 'step': 1678, 'epoch': 2} {'type': 'loss', 'content': 0.019529495388269424, 'timestamp': '2025-10-01 04:13:18.774215', 'step': 1679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.812820', 'step': 1679, 'epoch': 2} {'type': 'loss', 'content': 0.018010234460234642, 'timestamp': '2025-10-01 04:13:18.838502', 'step': 1680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.871249', 'step': 1680, 'epoch': 2} {'type': 'loss', 'content': 0.004157633520662785, 'timestamp': '2025-10-01 04:13:18.873334', 'step': 1681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.905518', 'step': 1681, 'epoch': 2} {'type': 'loss', 'content': 0.008690069429576397, 'timestamp': '2025-10-01 04:13:18.908634', 'step': 1682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.939282', 'step': 1682, 'epoch': 2} {'type': 'loss', 'content': 0.005145031027495861, 'timestamp': '2025-10-01 04:13:18.941810', 'step': 1683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:18.973616', 'step': 1683, 'epoch': 2} {'type': 'loss', 'content': 0.0030300146900117397, 'timestamp': '2025-10-01 04:13:18.997599', 'step': 1684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.029414', 'step': 1684, 'epoch': 2} {'type': 'loss', 'content': 0.025381917133927345, 'timestamp': '2025-10-01 04:13:19.031698', 'step': 1685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.062812', 'step': 1685, 'epoch': 2} {'type': 'loss', 'content': 0.005756060127168894, 'timestamp': '2025-10-01 04:13:19.064963', 'step': 1686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.096585', 'step': 1686, 'epoch': 2} {'type': 'loss', 'content': 0.03418432176113129, 'timestamp': '2025-10-01 04:13:19.099015', 'step': 1687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:19.129810', 'step': 1687, 'epoch': 2} {'type': 'loss', 'content': 0.04978889226913452, 'timestamp': '2025-10-01 04:13:19.153300', 'step': 1688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:19.185013', 'step': 1688, 'epoch': 2} {'type': 'loss', 'content': 0.001084079034626484, 'timestamp': '2025-10-01 04:13:19.187406', 'step': 1689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.218692', 'step': 1689, 'epoch': 2} {'type': 'loss', 'content': 0.05039183050394058, 'timestamp': '2025-10-01 04:13:19.220944', 'step': 1690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.252295', 'step': 1690, 'epoch': 2} {'type': 'loss', 'content': 0.00032829755218699574, 'timestamp': '2025-10-01 04:13:19.254419', 'step': 1691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:19.285597', 'step': 1691, 'epoch': 2} {'type': 'loss', 'content': 0.027190400287508965, 'timestamp': '2025-10-01 04:13:19.310401', 'step': 1692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:19.341228', 'step': 1692, 'epoch': 2} {'type': 'loss', 'content': 0.013346688821911812, 'timestamp': '2025-10-01 04:13:19.343899', 'step': 1693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:19.375400', 'step': 1693, 'epoch': 2} {'type': 'loss', 'content': 0.0057144020684063435, 'timestamp': '2025-10-01 04:13:19.378385', 'step': 1694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.409704', 'step': 1694, 'epoch': 2} {'type': 'loss', 'content': 0.0006181415519677103, 'timestamp': '2025-10-01 04:13:19.412020', 'step': 1695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:19.443084', 'step': 1695, 'epoch': 2} {'type': 'loss', 'content': 0.05258602276444435, 'timestamp': '2025-10-01 04:13:19.467230', 'step': 1696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.498307', 'step': 1696, 'epoch': 2} {'type': 'loss', 'content': 0.002812599530443549, 'timestamp': '2025-10-01 04:13:19.500894', 'step': 1697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.533068', 'step': 1697, 'epoch': 2} {'type': 'loss', 'content': 0.012100943364202976, 'timestamp': '2025-10-01 04:13:19.536128', 'step': 1698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:19.567213', 'step': 1698, 'epoch': 2} {'type': 'loss', 'content': 0.0027744087856262922, 'timestamp': '2025-10-01 04:13:19.569264', 'step': 1699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.599965', 'step': 1699, 'epoch': 2} {'type': 'loss', 'content': 0.012924875132739544, 'timestamp': '2025-10-01 04:13:19.623715', 'step': 1700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:19.654117', 'step': 1700, 'epoch': 2} {'type': 'loss', 'content': 0.013080582022666931, 'timestamp': '2025-10-01 04:13:19.656267', 'step': 1701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.686977', 'step': 1701, 'epoch': 2} {'type': 'loss', 'content': 0.0027978557627648115, 'timestamp': '2025-10-01 04:13:19.689059', 'step': 1702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:19.720209', 'step': 1702, 'epoch': 2} {'type': 'loss', 'content': 0.0007439813925884664, 'timestamp': '2025-10-01 04:13:19.722717', 'step': 1703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.753258', 'step': 1703, 'epoch': 2} {'type': 'loss', 'content': 0.000717491318937391, 'timestamp': '2025-10-01 04:13:19.777331', 'step': 1704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.807758', 'step': 1704, 'epoch': 2} {'type': 'loss', 'content': 0.007695492822676897, 'timestamp': '2025-10-01 04:13:19.809982', 'step': 1705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.840333', 'step': 1705, 'epoch': 2} {'type': 'loss', 'content': 0.01251158770173788, 'timestamp': '2025-10-01 04:13:19.842649', 'step': 1706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:19.873050', 'step': 1706, 'epoch': 2} {'type': 'loss', 'content': 0.0007180058746598661, 'timestamp': '2025-10-01 04:13:19.875098', 'step': 1707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.909658', 'step': 1707, 'epoch': 2} {'type': 'loss', 'content': 0.018272066488862038, 'timestamp': '2025-10-01 04:13:19.934124', 'step': 1708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.965005', 'step': 1708, 'epoch': 2} {'type': 'loss', 'content': 0.0062143513932824135, 'timestamp': '2025-10-01 04:13:19.967004', 'step': 1709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:19.997291', 'step': 1709, 'epoch': 2} {'type': 'loss', 'content': 0.003083524527028203, 'timestamp': '2025-10-01 04:13:19.999376', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:20.738670', 'step': 1710, 'epoch': 2} {'type': 'pplx', 'content': 68561080.48393603, 'timestamp': '2025-10-01 04:13:20.740900', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:20.770926', 'step': 1710, 'epoch': 2} {'type': 'loss', 'content': 0.023226430639624596, 'timestamp': '2025-10-01 04:13:20.773223', 'step': 1711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:20.804471', 'step': 1711, 'epoch': 2} {'type': 'loss', 'content': 0.019120950251817703, 'timestamp': '2025-10-01 04:13:20.828991', 'step': 1712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:20.867736', 'step': 1712, 'epoch': 2} {'type': 'loss', 'content': 0.005029122345149517, 'timestamp': '2025-10-01 04:13:20.869929', 'step': 1713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:20.900792', 'step': 1713, 'epoch': 2} {'type': 'loss', 'content': 0.0059706405736505985, 'timestamp': '2025-10-01 04:13:20.903081', 'step': 1714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:20.934588', 'step': 1714, 'epoch': 2} {'type': 'loss', 'content': 0.003695933148264885, 'timestamp': '2025-10-01 04:13:20.936922', 'step': 1715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:20.967642', 'step': 1715, 'epoch': 2} {'type': 'loss', 'content': 0.009229108691215515, 'timestamp': '2025-10-01 04:13:20.991312', 'step': 1716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.023899', 'step': 1716, 'epoch': 2} {'type': 'loss', 'content': 0.032100412994623184, 'timestamp': '2025-10-01 04:13:21.026151', 'step': 1717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.059084', 'step': 1717, 'epoch': 2} {'type': 'loss', 'content': 0.012853546068072319, 'timestamp': '2025-10-01 04:13:21.061434', 'step': 1718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:21.091500', 'step': 1718, 'epoch': 2} {'type': 'loss', 'content': 0.01731124520301819, 'timestamp': '2025-10-01 04:13:21.094051', 'step': 1719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.124812', 'step': 1719, 'epoch': 2} {'type': 'loss', 'content': 0.023197930306196213, 'timestamp': '2025-10-01 04:13:21.148388', 'step': 1720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.180463', 'step': 1720, 'epoch': 2} {'type': 'loss', 'content': 0.005342247895896435, 'timestamp': '2025-10-01 04:13:21.182601', 'step': 1721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.213459', 'step': 1721, 'epoch': 2} {'type': 'loss', 'content': 0.00981935765594244, 'timestamp': '2025-10-01 04:13:21.215613', 'step': 1722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.246660', 'step': 1722, 'epoch': 2} {'type': 'loss', 'content': 0.00958984438329935, 'timestamp': '2025-10-01 04:13:21.249205', 'step': 1723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.280733', 'step': 1723, 'epoch': 2} {'type': 'loss', 'content': 0.006270966958254576, 'timestamp': '2025-10-01 04:13:21.304568', 'step': 1724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:21.336570', 'step': 1724, 'epoch': 2} {'type': 'loss', 'content': 0.0034877455327659845, 'timestamp': '2025-10-01 04:13:21.338909', 'step': 1725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:21.369321', 'step': 1725, 'epoch': 2} {'type': 'loss', 'content': 0.010010543279349804, 'timestamp': '2025-10-01 04:13:21.371830', 'step': 1726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:21.402628', 'step': 1726, 'epoch': 2} {'type': 'loss', 'content': 0.01026623509824276, 'timestamp': '2025-10-01 04:13:21.405007', 'step': 1727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.435258', 'step': 1727, 'epoch': 2} {'type': 'loss', 'content': 0.0038282847963273525, 'timestamp': '2025-10-01 04:13:21.459183', 'step': 1728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.489333', 'step': 1728, 'epoch': 2} {'type': 'loss', 'content': 0.007716696243733168, 'timestamp': '2025-10-01 04:13:21.491842', 'step': 1729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.521786', 'step': 1729, 'epoch': 2} {'type': 'loss', 'content': 0.013947995379567146, 'timestamp': '2025-10-01 04:13:21.523495', 'step': 1730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:21.554055', 'step': 1730, 'epoch': 2} {'type': 'loss', 'content': 0.008495950140058994, 'timestamp': '2025-10-01 04:13:21.556053', 'step': 1731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:21.588730', 'step': 1731, 'epoch': 2} {'type': 'loss', 'content': 0.0012179531622678041, 'timestamp': '2025-10-01 04:13:21.612653', 'step': 1732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.644163', 'step': 1732, 'epoch': 2} {'type': 'loss', 'content': 0.004571664612740278, 'timestamp': '2025-10-01 04:13:21.646329', 'step': 1733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:21.677325', 'step': 1733, 'epoch': 2} {'type': 'loss', 'content': 0.010332281701266766, 'timestamp': '2025-10-01 04:13:21.680016', 'step': 1734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.711530', 'step': 1734, 'epoch': 2} {'type': 'loss', 'content': 0.004366362001746893, 'timestamp': '2025-10-01 04:13:21.713653', 'step': 1735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.750472', 'step': 1735, 'epoch': 2} {'type': 'loss', 'content': 0.006593588273972273, 'timestamp': '2025-10-01 04:13:21.774349', 'step': 1736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:21.805952', 'step': 1736, 'epoch': 2} {'type': 'loss', 'content': 0.011689740233123302, 'timestamp': '2025-10-01 04:13:21.810334', 'step': 1737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:21.844644', 'step': 1737, 'epoch': 2} {'type': 'loss', 'content': 0.01590937003493309, 'timestamp': '2025-10-01 04:13:21.847777', 'step': 1738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:21.890165', 'step': 1738, 'epoch': 2} {'type': 'loss', 'content': 0.0026656892150640488, 'timestamp': '2025-10-01 04:13:21.895407', 'step': 1739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:21.931053', 'step': 1739, 'epoch': 2} {'type': 'loss', 'content': 0.0040006088092923164, 'timestamp': '2025-10-01 04:13:21.954962', 'step': 1740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:21.990483', 'step': 1740, 'epoch': 2} {'type': 'loss', 'content': 0.02416076697409153, 'timestamp': '2025-10-01 04:13:21.992541', 'step': 1741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:22.024682', 'step': 1741, 'epoch': 2} {'type': 'loss', 'content': 0.006946610752493143, 'timestamp': '2025-10-01 04:13:22.027259', 'step': 1742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.060826', 'step': 1742, 'epoch': 2} {'type': 'loss', 'content': 0.010857383720576763, 'timestamp': '2025-10-01 04:13:22.064105', 'step': 1743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:22.095093', 'step': 1743, 'epoch': 2} {'type': 'loss', 'content': 0.02855035290122032, 'timestamp': '2025-10-01 04:13:22.119146', 'step': 1744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:22.150609', 'step': 1744, 'epoch': 2} {'type': 'loss', 'content': 0.0031122262589633465, 'timestamp': '2025-10-01 04:13:22.154174', 'step': 1745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.186931', 'step': 1745, 'epoch': 2} {'type': 'loss', 'content': 0.0025793740060180426, 'timestamp': '2025-10-01 04:13:22.189117', 'step': 1746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.219949', 'step': 1746, 'epoch': 2} {'type': 'loss', 'content': 0.002730799140408635, 'timestamp': '2025-10-01 04:13:22.222250', 'step': 1747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.253837', 'step': 1747, 'epoch': 2} {'type': 'loss', 'content': 0.0016422842163592577, 'timestamp': '2025-10-01 04:13:22.278145', 'step': 1748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.313150', 'step': 1748, 'epoch': 2} {'type': 'loss', 'content': 0.0013755019754171371, 'timestamp': '2025-10-01 04:13:22.315281', 'step': 1749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.347030', 'step': 1749, 'epoch': 2} {'type': 'loss', 'content': 0.005876249633729458, 'timestamp': '2025-10-01 04:13:22.349291', 'step': 1750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:22.381730', 'step': 1750, 'epoch': 2} {'type': 'loss', 'content': 0.013547523878514767, 'timestamp': '2025-10-01 04:13:22.383937', 'step': 1751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:22.417666', 'step': 1751, 'epoch': 2} {'type': 'loss', 'content': 0.0024058823473751545, 'timestamp': '2025-10-01 04:13:22.441879', 'step': 1752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.477321', 'step': 1752, 'epoch': 2} {'type': 'loss', 'content': 0.008786443620920181, 'timestamp': '2025-10-01 04:13:22.481635', 'step': 1753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:22.514109', 'step': 1753, 'epoch': 2} {'type': 'loss', 'content': 0.037695612758398056, 'timestamp': '2025-10-01 04:13:22.520794', 'step': 1754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.553046', 'step': 1754, 'epoch': 2} {'type': 'loss', 'content': 0.006795606575906277, 'timestamp': '2025-10-01 04:13:22.556391', 'step': 1755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.591832', 'step': 1755, 'epoch': 2} {'type': 'loss', 'content': 0.01414434053003788, 'timestamp': '2025-10-01 04:13:22.615965', 'step': 1756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:22.648091', 'step': 1756, 'epoch': 2} {'type': 'loss', 'content': 0.0011864519910886884, 'timestamp': '2025-10-01 04:13:22.650351', 'step': 1757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.685759', 'step': 1757, 'epoch': 2} {'type': 'loss', 'content': 0.003651248523965478, 'timestamp': '2025-10-01 04:13:22.688110', 'step': 1758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.718855', 'step': 1758, 'epoch': 2} {'type': 'loss', 'content': 0.004483689088374376, 'timestamp': '2025-10-01 04:13:22.722450', 'step': 1759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.752697', 'step': 1759, 'epoch': 2} {'type': 'loss', 'content': 0.01843567192554474, 'timestamp': '2025-10-01 04:13:22.777064', 'step': 1760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.807820', 'step': 1760, 'epoch': 2} {'type': 'loss', 'content': 0.006289682351052761, 'timestamp': '2025-10-01 04:13:22.811725', 'step': 1761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:22.843814', 'step': 1761, 'epoch': 2} {'type': 'loss', 'content': 0.0012012216029688716, 'timestamp': '2025-10-01 04:13:22.846337', 'step': 1762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:22.877408', 'step': 1762, 'epoch': 2} {'type': 'loss', 'content': 0.0004856856248807162, 'timestamp': '2025-10-01 04:13:22.879897', 'step': 1763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:22.911346', 'step': 1763, 'epoch': 2} {'type': 'loss', 'content': 0.007547799032181501, 'timestamp': '2025-10-01 04:13:22.935664', 'step': 1764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:22.967058', 'step': 1764, 'epoch': 2} {'type': 'loss', 'content': 0.007898692972958088, 'timestamp': '2025-10-01 04:13:22.969615', 'step': 1765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:23.000624', 'step': 1765, 'epoch': 2} {'type': 'loss', 'content': 0.000819383654743433, 'timestamp': '2025-10-01 04:13:23.003621', 'step': 1766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:23.042682', 'step': 1766, 'epoch': 2} {'type': 'loss', 'content': 0.0007117543718777597, 'timestamp': '2025-10-01 04:13:23.045222', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:23.808408', 'step': 1767, 'epoch': 2} {'type': 'pplx', 'content': 68241018.03120764, 'timestamp': '2025-10-01 04:13:23.810763', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:23.840481', 'step': 1767, 'epoch': 2} {'type': 'loss', 'content': 0.0007178504602052271, 'timestamp': '2025-10-01 04:13:23.865251', 'step': 1768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:23.897832', 'step': 1768, 'epoch': 2} {'type': 'loss', 'content': 0.0072966525331139565, 'timestamp': '2025-10-01 04:13:23.901304', 'step': 1769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:23.934168', 'step': 1769, 'epoch': 2} {'type': 'loss', 'content': 0.0020607777405530214, 'timestamp': '2025-10-01 04:13:23.937030', 'step': 1770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:23.969218', 'step': 1770, 'epoch': 2} {'type': 'loss', 'content': 0.0053713517263531685, 'timestamp': '2025-10-01 04:13:23.971527', 'step': 1771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.003104', 'step': 1771, 'epoch': 2} {'type': 'loss', 'content': 0.00041605104343034327, 'timestamp': '2025-10-01 04:13:24.028077', 'step': 1772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.060185', 'step': 1772, 'epoch': 2} {'type': 'loss', 'content': 0.0013556292979046702, 'timestamp': '2025-10-01 04:13:24.062501', 'step': 1773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.094690', 'step': 1773, 'epoch': 2} {'type': 'loss', 'content': 0.008703082799911499, 'timestamp': '2025-10-01 04:13:24.096944', 'step': 1774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:24.129949', 'step': 1774, 'epoch': 2} {'type': 'loss', 'content': 0.012519395910203457, 'timestamp': '2025-10-01 04:13:24.132520', 'step': 1775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.164454', 'step': 1775, 'epoch': 2} {'type': 'loss', 'content': 0.002101104473695159, 'timestamp': '2025-10-01 04:13:24.188362', 'step': 1776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.221822', 'step': 1776, 'epoch': 2} {'type': 'loss', 'content': 0.0025045033544301987, 'timestamp': '2025-10-01 04:13:24.224252', 'step': 1777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.256233', 'step': 1777, 'epoch': 2} {'type': 'loss', 'content': 0.001200907165184617, 'timestamp': '2025-10-01 04:13:24.258386', 'step': 1778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:24.290209', 'step': 1778, 'epoch': 2} {'type': 'loss', 'content': 0.0042821685783565044, 'timestamp': '2025-10-01 04:13:24.292482', 'step': 1779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.323204', 'step': 1779, 'epoch': 2} {'type': 'loss', 'content': 0.001926648779772222, 'timestamp': '2025-10-01 04:13:24.347615', 'step': 1780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.379004', 'step': 1780, 'epoch': 2} {'type': 'loss', 'content': 0.0010534462053328753, 'timestamp': '2025-10-01 04:13:24.381470', 'step': 1781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.412959', 'step': 1781, 'epoch': 2} {'type': 'loss', 'content': 0.003271149704232812, 'timestamp': '2025-10-01 04:13:24.415052', 'step': 1782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:24.446892', 'step': 1782, 'epoch': 2} {'type': 'loss', 'content': 0.00037074877764098346, 'timestamp': '2025-10-01 04:13:24.449025', 'step': 1783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.480018', 'step': 1783, 'epoch': 2} {'type': 'loss', 'content': 0.0007882600766606629, 'timestamp': '2025-10-01 04:13:24.503958', 'step': 1784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.534667', 'step': 1784, 'epoch': 2} {'type': 'loss', 'content': 0.0005073948414064944, 'timestamp': '2025-10-01 04:13:24.537978', 'step': 1785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:24.570750', 'step': 1785, 'epoch': 2} {'type': 'loss', 'content': 0.0011808349518105388, 'timestamp': '2025-10-01 04:13:24.574587', 'step': 1786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.605971', 'step': 1786, 'epoch': 2} {'type': 'loss', 'content': 0.015473189763724804, 'timestamp': '2025-10-01 04:13:24.608161', 'step': 1787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.639638', 'step': 1787, 'epoch': 2} {'type': 'loss', 'content': 0.01565859653055668, 'timestamp': '2025-10-01 04:13:24.664111', 'step': 1788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.695807', 'step': 1788, 'epoch': 2} {'type': 'loss', 'content': 0.000664880673866719, 'timestamp': '2025-10-01 04:13:24.698227', 'step': 1789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.729936', 'step': 1789, 'epoch': 2} {'type': 'loss', 'content': 0.0029478694777935743, 'timestamp': '2025-10-01 04:13:24.732024', 'step': 1790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.762747', 'step': 1790, 'epoch': 2} {'type': 'loss', 'content': 0.0037737940438091755, 'timestamp': '2025-10-01 04:13:24.765005', 'step': 1791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.795681', 'step': 1791, 'epoch': 2} {'type': 'loss', 'content': 0.006180554162710905, 'timestamp': '2025-10-01 04:13:24.820653', 'step': 1792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.851306', 'step': 1792, 'epoch': 2} {'type': 'loss', 'content': 0.002698136493563652, 'timestamp': '2025-10-01 04:13:24.854328', 'step': 1793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:24.886677', 'step': 1793, 'epoch': 2} {'type': 'loss', 'content': 0.0008915706421248615, 'timestamp': '2025-10-01 04:13:24.889168', 'step': 1794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.921713', 'step': 1794, 'epoch': 2} {'type': 'loss', 'content': 0.0012432237854227424, 'timestamp': '2025-10-01 04:13:24.923776', 'step': 1795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:24.954273', 'step': 1795, 'epoch': 2} {'type': 'loss', 'content': 0.0019791352096945047, 'timestamp': '2025-10-01 04:13:24.978167', 'step': 1796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:25.009990', 'step': 1796, 'epoch': 2} {'type': 'loss', 'content': 0.018590480089187622, 'timestamp': '2025-10-01 04:13:25.012037', 'step': 1797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.043127', 'step': 1797, 'epoch': 2} {'type': 'loss', 'content': 0.037092529237270355, 'timestamp': '2025-10-01 04:13:25.045790', 'step': 1798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.077599', 'step': 1798, 'epoch': 2} {'type': 'loss', 'content': 0.02361510694026947, 'timestamp': '2025-10-01 04:13:25.079796', 'step': 1799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.110996', 'step': 1799, 'epoch': 2} {'type': 'loss', 'content': 0.030858786776661873, 'timestamp': '2025-10-01 04:13:25.135008', 'step': 1800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:25.165488', 'step': 1800, 'epoch': 2} {'type': 'loss', 'content': 0.0010235338704660535, 'timestamp': '2025-10-01 04:13:25.167691', 'step': 1801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.198262', 'step': 1801, 'epoch': 2} {'type': 'loss', 'content': 0.015298736281692982, 'timestamp': '2025-10-01 04:13:25.200240', 'step': 1802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.232740', 'step': 1802, 'epoch': 2} {'type': 'loss', 'content': 0.001420345390215516, 'timestamp': '2025-10-01 04:13:25.234921', 'step': 1803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:25.265541', 'step': 1803, 'epoch': 2} {'type': 'loss', 'content': 0.0029548178426921368, 'timestamp': '2025-10-01 04:13:25.289429', 'step': 1804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.320826', 'step': 1804, 'epoch': 2} {'type': 'loss', 'content': 0.01599166728556156, 'timestamp': '2025-10-01 04:13:25.323047', 'step': 1805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.354007', 'step': 1805, 'epoch': 2} {'type': 'loss', 'content': 0.0019125614780932665, 'timestamp': '2025-10-01 04:13:25.356149', 'step': 1806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.387220', 'step': 1806, 'epoch': 2} {'type': 'loss', 'content': 0.022777797654271126, 'timestamp': '2025-10-01 04:13:25.389367', 'step': 1807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.424041', 'step': 1807, 'epoch': 2} {'type': 'loss', 'content': 0.004297844599932432, 'timestamp': '2025-10-01 04:13:25.447850', 'step': 1808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:25.479354', 'step': 1808, 'epoch': 2} {'type': 'loss', 'content': 0.0009682009113021195, 'timestamp': '2025-10-01 04:13:25.481785', 'step': 1809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:25.512359', 'step': 1809, 'epoch': 2} {'type': 'loss', 'content': 0.0078015453182160854, 'timestamp': '2025-10-01 04:13:25.514773', 'step': 1810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.545490', 'step': 1810, 'epoch': 2} {'type': 'loss', 'content': 0.008723942562937737, 'timestamp': '2025-10-01 04:13:25.547920', 'step': 1811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.578423', 'step': 1811, 'epoch': 2} {'type': 'loss', 'content': 0.005301562137901783, 'timestamp': '2025-10-01 04:13:25.602838', 'step': 1812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.633262', 'step': 1812, 'epoch': 2} {'type': 'loss', 'content': 0.002418793737888336, 'timestamp': '2025-10-01 04:13:25.635792', 'step': 1813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.666735', 'step': 1813, 'epoch': 2} {'type': 'loss', 'content': 0.0029904316179454327, 'timestamp': '2025-10-01 04:13:25.669067', 'step': 1814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:25.699870', 'step': 1814, 'epoch': 2} {'type': 'loss', 'content': 0.006601743865758181, 'timestamp': '2025-10-01 04:13:25.702131', 'step': 1815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:25.733411', 'step': 1815, 'epoch': 2} {'type': 'loss', 'content': 0.0020187620539218187, 'timestamp': '2025-10-01 04:13:25.757399', 'step': 1816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:25.788661', 'step': 1816, 'epoch': 2} {'type': 'loss', 'content': 0.0014762390637770295, 'timestamp': '2025-10-01 04:13:25.790650', 'step': 1817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.821422', 'step': 1817, 'epoch': 2} {'type': 'loss', 'content': 0.035270411521196365, 'timestamp': '2025-10-01 04:13:25.823667', 'step': 1818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.855237', 'step': 1818, 'epoch': 2} {'type': 'loss', 'content': 0.02132105454802513, 'timestamp': '2025-10-01 04:13:25.858275', 'step': 1819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.889586', 'step': 1819, 'epoch': 2} {'type': 'loss', 'content': 0.0026816707104444504, 'timestamp': '2025-10-01 04:13:25.913438', 'step': 1820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.944648', 'step': 1820, 'epoch': 2} {'type': 'loss', 'content': 0.007399698253720999, 'timestamp': '2025-10-01 04:13:25.946626', 'step': 1821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:25.977468', 'step': 1821, 'epoch': 2} {'type': 'loss', 'content': 0.0009930060477927327, 'timestamp': '2025-10-01 04:13:25.980770', 'step': 1822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:26.012561', 'step': 1822, 'epoch': 2} {'type': 'loss', 'content': 0.0034744550939649343, 'timestamp': '2025-10-01 04:13:26.015065', 'step': 1823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:26.046539', 'step': 1823, 'epoch': 2} {'type': 'loss', 'content': 0.008859805762767792, 'timestamp': '2025-10-01 04:13:26.073275', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:26.888108', 'step': 1824, 'epoch': 2} {'type': 'pplx', 'content': 74796042.91379905, 'timestamp': '2025-10-01 04:13:26.890423', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:26.921508', 'step': 1824, 'epoch': 2} {'type': 'loss', 'content': 0.003556231502443552, 'timestamp': '2025-10-01 04:13:26.925792', 'step': 1825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:26.960528', 'step': 1825, 'epoch': 2} {'type': 'loss', 'content': 0.004978193901479244, 'timestamp': '2025-10-01 04:13:26.963397', 'step': 1826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:26.996917', 'step': 1826, 'epoch': 2} {'type': 'loss', 'content': 0.0016510151326656342, 'timestamp': '2025-10-01 04:13:26.999777', 'step': 1827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:27.033159', 'step': 1827, 'epoch': 2} {'type': 'loss', 'content': 0.007656362373381853, 'timestamp': '2025-10-01 04:13:27.056743', 'step': 1828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:27.088814', 'step': 1828, 'epoch': 2} {'type': 'loss', 'content': 0.0006503559998236597, 'timestamp': '2025-10-01 04:13:27.093304', 'step': 1829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.125807', 'step': 1829, 'epoch': 2} {'type': 'loss', 'content': 0.03609628230333328, 'timestamp': '2025-10-01 04:13:27.129373', 'step': 1830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:27.161355', 'step': 1830, 'epoch': 2} {'type': 'loss', 'content': 0.00043880558223463595, 'timestamp': '2025-10-01 04:13:27.164355', 'step': 1831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.197251', 'step': 1831, 'epoch': 2} {'type': 'loss', 'content': 0.003280007978901267, 'timestamp': '2025-10-01 04:13:27.221554', 'step': 1832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.256437', 'step': 1832, 'epoch': 2} {'type': 'loss', 'content': 0.002217098604887724, 'timestamp': '2025-10-01 04:13:27.259061', 'step': 1833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:27.292756', 'step': 1833, 'epoch': 2} {'type': 'loss', 'content': 0.0035265518818050623, 'timestamp': '2025-10-01 04:13:27.295747', 'step': 1834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:27.347395', 'step': 1834, 'epoch': 3} {'type': 'loss', 'content': 0.03178298473358154, 'timestamp': '2025-10-01 04:13:27.350825', 'step': 1835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.384450', 'step': 1835, 'epoch': 3} {'type': 'loss', 'content': 0.023568039759993553, 'timestamp': '2025-10-01 04:13:27.409341', 'step': 1836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.445995', 'step': 1836, 'epoch': 3} {'type': 'loss', 'content': 0.005751292686909437, 'timestamp': '2025-10-01 04:13:27.449020', 'step': 1837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.482748', 'step': 1837, 'epoch': 3} {'type': 'loss', 'content': 0.0012173604918643832, 'timestamp': '2025-10-01 04:13:27.485326', 'step': 1838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.518569', 'step': 1838, 'epoch': 3} {'type': 'loss', 'content': 0.056264422833919525, 'timestamp': '2025-10-01 04:13:27.521494', 'step': 1839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.554854', 'step': 1839, 'epoch': 3} {'type': 'loss', 'content': 0.018358344212174416, 'timestamp': '2025-10-01 04:13:27.579895', 'step': 1840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:27.613386', 'step': 1840, 'epoch': 3} {'type': 'loss', 'content': 0.010695835575461388, 'timestamp': '2025-10-01 04:13:27.616027', 'step': 1841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.650847', 'step': 1841, 'epoch': 3} {'type': 'loss', 'content': 0.0009615811286494136, 'timestamp': '2025-10-01 04:13:27.653408', 'step': 1842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.686778', 'step': 1842, 'epoch': 3} {'type': 'loss', 'content': 0.0005880445241928101, 'timestamp': '2025-10-01 04:13:27.689243', 'step': 1843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.722631', 'step': 1843, 'epoch': 3} {'type': 'loss', 'content': 0.034726135432720184, 'timestamp': '2025-10-01 04:13:27.747599', 'step': 1844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.781394', 'step': 1844, 'epoch': 3} {'type': 'loss', 'content': 0.009501738473773003, 'timestamp': '2025-10-01 04:13:27.784021', 'step': 1845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.818136', 'step': 1845, 'epoch': 3} {'type': 'loss', 'content': 0.001588527811691165, 'timestamp': '2025-10-01 04:13:27.820910', 'step': 1846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.855251', 'step': 1846, 'epoch': 3} {'type': 'loss', 'content': 0.010716503486037254, 'timestamp': '2025-10-01 04:13:27.857868', 'step': 1847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.891326', 'step': 1847, 'epoch': 3} {'type': 'loss', 'content': 0.029663624241948128, 'timestamp': '2025-10-01 04:13:27.915703', 'step': 1848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.950094', 'step': 1848, 'epoch': 3} {'type': 'loss', 'content': 0.013627013191580772, 'timestamp': '2025-10-01 04:13:27.952950', 'step': 1849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:27.986386', 'step': 1849, 'epoch': 3} {'type': 'loss', 'content': 0.014335517771542072, 'timestamp': '2025-10-01 04:13:27.989799', 'step': 1850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.021633', 'step': 1850, 'epoch': 3} {'type': 'loss', 'content': 0.011362132616341114, 'timestamp': '2025-10-01 04:13:28.024810', 'step': 1851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.058184', 'step': 1851, 'epoch': 3} {'type': 'loss', 'content': 0.004223748110234737, 'timestamp': '2025-10-01 04:13:28.082747', 'step': 1852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:28.116575', 'step': 1852, 'epoch': 3} {'type': 'loss', 'content': 0.012491399422287941, 'timestamp': '2025-10-01 04:13:28.119353', 'step': 1853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.151658', 'step': 1853, 'epoch': 3} {'type': 'loss', 'content': 0.013354037888348103, 'timestamp': '2025-10-01 04:13:28.153970', 'step': 1854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:28.186118', 'step': 1854, 'epoch': 3} {'type': 'loss', 'content': 0.021396491676568985, 'timestamp': '2025-10-01 04:13:28.194063', 'step': 1855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.225442', 'step': 1855, 'epoch': 3} {'type': 'loss', 'content': 0.05161554366350174, 'timestamp': '2025-10-01 04:13:28.249089', 'step': 1856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.280062', 'step': 1856, 'epoch': 3} {'type': 'loss', 'content': 0.010022538714110851, 'timestamp': '2025-10-01 04:13:28.281987', 'step': 1857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.312848', 'step': 1857, 'epoch': 3} {'type': 'loss', 'content': 0.011016866192221642, 'timestamp': '2025-10-01 04:13:28.315012', 'step': 1858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.345778', 'step': 1858, 'epoch': 3} {'type': 'loss', 'content': 0.006926259491592646, 'timestamp': '2025-10-01 04:13:28.347844', 'step': 1859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:28.379938', 'step': 1859, 'epoch': 3} {'type': 'loss', 'content': 0.004610821604728699, 'timestamp': '2025-10-01 04:13:28.403784', 'step': 1860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:28.436043', 'step': 1860, 'epoch': 3} {'type': 'loss', 'content': 0.004365658853203058, 'timestamp': '2025-10-01 04:13:28.438057', 'step': 1861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.470113', 'step': 1861, 'epoch': 3} {'type': 'loss', 'content': 0.027243614196777344, 'timestamp': '2025-10-01 04:13:28.472186', 'step': 1862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.504620', 'step': 1862, 'epoch': 3} {'type': 'loss', 'content': 0.0072929286397993565, 'timestamp': '2025-10-01 04:13:28.506635', 'step': 1863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.539042', 'step': 1863, 'epoch': 3} {'type': 'loss', 'content': 0.012515954673290253, 'timestamp': '2025-10-01 04:13:28.562938', 'step': 1864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.595735', 'step': 1864, 'epoch': 3} {'type': 'loss', 'content': 0.011179575696587563, 'timestamp': '2025-10-01 04:13:28.597981', 'step': 1865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.630308', 'step': 1865, 'epoch': 3} {'type': 'loss', 'content': 0.022026440128684044, 'timestamp': '2025-10-01 04:13:28.632844', 'step': 1866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.665183', 'step': 1866, 'epoch': 3} {'type': 'loss', 'content': 0.008464011363685131, 'timestamp': '2025-10-01 04:13:28.667271', 'step': 1867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.698879', 'step': 1867, 'epoch': 3} {'type': 'loss', 'content': 0.018374668434262276, 'timestamp': '2025-10-01 04:13:28.722783', 'step': 1868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.755232', 'step': 1868, 'epoch': 3} {'type': 'loss', 'content': 0.0033123858738690615, 'timestamp': '2025-10-01 04:13:28.757207', 'step': 1869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.789810', 'step': 1869, 'epoch': 3} {'type': 'loss', 'content': 0.01134712714701891, 'timestamp': '2025-10-01 04:13:28.791793', 'step': 1870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.823941', 'step': 1870, 'epoch': 3} {'type': 'loss', 'content': 0.015737758949398994, 'timestamp': '2025-10-01 04:13:28.826768', 'step': 1871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.858581', 'step': 1871, 'epoch': 3} {'type': 'loss', 'content': 0.004202558193355799, 'timestamp': '2025-10-01 04:13:28.882595', 'step': 1872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:28.914707', 'step': 1872, 'epoch': 3} {'type': 'loss', 'content': 0.005341304000467062, 'timestamp': '2025-10-01 04:13:28.916798', 'step': 1873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.947280', 'step': 1873, 'epoch': 3} {'type': 'loss', 'content': 0.010225766338407993, 'timestamp': '2025-10-01 04:13:28.949059', 'step': 1874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:28.980790', 'step': 1874, 'epoch': 3} {'type': 'loss', 'content': 0.007927073165774345, 'timestamp': '2025-10-01 04:13:28.983055', 'step': 1875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:29.013392', 'step': 1875, 'epoch': 3} {'type': 'loss', 'content': 0.015908638015389442, 'timestamp': '2025-10-01 04:13:29.037108', 'step': 1876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:29.068314', 'step': 1876, 'epoch': 3} {'type': 'loss', 'content': 0.010531976819038391, 'timestamp': '2025-10-01 04:13:29.070716', 'step': 1877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:29.101504', 'step': 1877, 'epoch': 3} {'type': 'loss', 'content': 0.0012402069987729192, 'timestamp': '2025-10-01 04:13:29.103558', 'step': 1878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:29.135058', 'step': 1878, 'epoch': 3} {'type': 'loss', 'content': 0.00214685732498765, 'timestamp': '2025-10-01 04:13:29.137054', 'step': 1879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:29.167608', 'step': 1879, 'epoch': 3} {'type': 'loss', 'content': 0.0022061713971197605, 'timestamp': '2025-10-01 04:13:29.191987', 'step': 1880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:29.223633', 'step': 1880, 'epoch': 3} {'type': 'loss', 'content': 0.004665750078856945, 'timestamp': '2025-10-01 04:13:29.225559', 'step': 1881, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:29.954864', 'step': 1881, 'epoch': 3} {'type': 'pplx', 'content': 65838218.305152945, 'timestamp': '2025-10-01 04:13:29.956724', 'step': 1881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:29.986217', 'step': 1881, 'epoch': 3} {'type': 'loss', 'content': 0.0013645051512867212, 'timestamp': '2025-10-01 04:13:29.988092', 'step': 1882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.018620', 'step': 1882, 'epoch': 3} {'type': 'loss', 'content': 0.001059826696291566, 'timestamp': '2025-10-01 04:13:30.020716', 'step': 1883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.053436', 'step': 1883, 'epoch': 3} {'type': 'loss', 'content': 0.0016396061982959509, 'timestamp': '2025-10-01 04:13:30.077176', 'step': 1884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.108481', 'step': 1884, 'epoch': 3} {'type': 'loss', 'content': 0.0006029635551385581, 'timestamp': '2025-10-01 04:13:30.110611', 'step': 1885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.143587', 'step': 1885, 'epoch': 3} {'type': 'loss', 'content': 0.03112313151359558, 'timestamp': '2025-10-01 04:13:30.145822', 'step': 1886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.177911', 'step': 1886, 'epoch': 3} {'type': 'loss', 'content': 0.02117547206580639, 'timestamp': '2025-10-01 04:13:30.180583', 'step': 1887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:30.213919', 'step': 1887, 'epoch': 3} {'type': 'loss', 'content': 0.005383083131164312, 'timestamp': '2025-10-01 04:13:30.237897', 'step': 1888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.273917', 'step': 1888, 'epoch': 3} {'type': 'loss', 'content': 0.011658556759357452, 'timestamp': '2025-10-01 04:13:30.276449', 'step': 1889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.309107', 'step': 1889, 'epoch': 3} {'type': 'loss', 'content': 0.0026283669285476208, 'timestamp': '2025-10-01 04:13:30.311166', 'step': 1890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.343471', 'step': 1890, 'epoch': 3} {'type': 'loss', 'content': 0.0016616786597296596, 'timestamp': '2025-10-01 04:13:30.345473', 'step': 1891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.376082', 'step': 1891, 'epoch': 3} {'type': 'loss', 'content': 0.006170527543872595, 'timestamp': '2025-10-01 04:13:30.399812', 'step': 1892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.430571', 'step': 1892, 'epoch': 3} {'type': 'loss', 'content': 0.011967835947871208, 'timestamp': '2025-10-01 04:13:30.432581', 'step': 1893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.462710', 'step': 1893, 'epoch': 3} {'type': 'loss', 'content': 0.03743167966604233, 'timestamp': '2025-10-01 04:13:30.464893', 'step': 1894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.496021', 'step': 1894, 'epoch': 3} {'type': 'loss', 'content': 0.0017973927315324545, 'timestamp': '2025-10-01 04:13:30.498004', 'step': 1895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.527694', 'step': 1895, 'epoch': 3} {'type': 'loss', 'content': 0.0042338259518146515, 'timestamp': '2025-10-01 04:13:30.551287', 'step': 1896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.581793', 'step': 1896, 'epoch': 3} {'type': 'loss', 'content': 0.0005765099194832146, 'timestamp': '2025-10-01 04:13:30.583841', 'step': 1897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.614291', 'step': 1897, 'epoch': 3} {'type': 'loss', 'content': 0.0013105726102367043, 'timestamp': '2025-10-01 04:13:30.616201', 'step': 1898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.650389', 'step': 1898, 'epoch': 3} {'type': 'loss', 'content': 0.0006543918279930949, 'timestamp': '2025-10-01 04:13:30.652414', 'step': 1899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.682310', 'step': 1899, 'epoch': 3} {'type': 'loss', 'content': 0.040685024112463, 'timestamp': '2025-10-01 04:13:30.705820', 'step': 1900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.736117', 'step': 1900, 'epoch': 3} {'type': 'loss', 'content': 0.0019543597009032965, 'timestamp': '2025-10-01 04:13:30.741473', 'step': 1901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.772803', 'step': 1901, 'epoch': 3} {'type': 'loss', 'content': 0.014719049446284771, 'timestamp': '2025-10-01 04:13:30.775507', 'step': 1902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.805883', 'step': 1902, 'epoch': 3} {'type': 'loss', 'content': 0.014800036326050758, 'timestamp': '2025-10-01 04:13:30.808051', 'step': 1903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.838300', 'step': 1903, 'epoch': 3} {'type': 'loss', 'content': 0.03836487978696823, 'timestamp': '2025-10-01 04:13:30.866424', 'step': 1904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.897353', 'step': 1904, 'epoch': 3} {'type': 'loss', 'content': 0.0014941886765882373, 'timestamp': '2025-10-01 04:13:30.904541', 'step': 1905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:30.935950', 'step': 1905, 'epoch': 3} {'type': 'loss', 'content': 0.03499216586351395, 'timestamp': '2025-10-01 04:13:30.938392', 'step': 1906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:30.968920', 'step': 1906, 'epoch': 3} {'type': 'loss', 'content': 0.0013538615312427282, 'timestamp': '2025-10-01 04:13:30.974435', 'step': 1907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.011062', 'step': 1907, 'epoch': 3} {'type': 'loss', 'content': 0.02370210364460945, 'timestamp': '2025-10-01 04:13:31.036779', 'step': 1908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.071927', 'step': 1908, 'epoch': 3} {'type': 'loss', 'content': 0.014159292913973331, 'timestamp': '2025-10-01 04:13:31.074016', 'step': 1909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:31.104665', 'step': 1909, 'epoch': 3} {'type': 'loss', 'content': 0.01698492281138897, 'timestamp': '2025-10-01 04:13:31.107265', 'step': 1910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.138415', 'step': 1910, 'epoch': 3} {'type': 'loss', 'content': 0.013147769495844841, 'timestamp': '2025-10-01 04:13:31.140256', 'step': 1911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.170541', 'step': 1911, 'epoch': 3} {'type': 'loss', 'content': 0.0037520453333854675, 'timestamp': '2025-10-01 04:13:31.195782', 'step': 1912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.229118', 'step': 1912, 'epoch': 3} {'type': 'loss', 'content': 0.015137665905058384, 'timestamp': '2025-10-01 04:13:31.231257', 'step': 1913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:31.261872', 'step': 1913, 'epoch': 3} {'type': 'loss', 'content': 0.00783214159309864, 'timestamp': '2025-10-01 04:13:31.264099', 'step': 1914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.294172', 'step': 1914, 'epoch': 3} {'type': 'loss', 'content': 0.02275925502181053, 'timestamp': '2025-10-01 04:13:31.296275', 'step': 1915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.326140', 'step': 1915, 'epoch': 3} {'type': 'loss', 'content': 0.012978742830455303, 'timestamp': '2025-10-01 04:13:31.349911', 'step': 1916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.380199', 'step': 1916, 'epoch': 3} {'type': 'loss', 'content': 0.0010948796989396214, 'timestamp': '2025-10-01 04:13:31.383428', 'step': 1917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.413322', 'step': 1917, 'epoch': 3} {'type': 'loss', 'content': 0.021318109706044197, 'timestamp': '2025-10-01 04:13:31.415399', 'step': 1918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.445402', 'step': 1918, 'epoch': 3} {'type': 'loss', 'content': 0.005678139626979828, 'timestamp': '2025-10-01 04:13:31.447655', 'step': 1919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.477948', 'step': 1919, 'epoch': 3} {'type': 'loss', 'content': 0.04414077475667, 'timestamp': '2025-10-01 04:13:31.501749', 'step': 1920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.532667', 'step': 1920, 'epoch': 3} {'type': 'loss', 'content': 0.011896932497620583, 'timestamp': '2025-10-01 04:13:31.534894', 'step': 1921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.566668', 'step': 1921, 'epoch': 3} {'type': 'loss', 'content': 0.015560947358608246, 'timestamp': '2025-10-01 04:13:31.569044', 'step': 1922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.599157', 'step': 1922, 'epoch': 3} {'type': 'loss', 'content': 0.04291582852602005, 'timestamp': '2025-10-01 04:13:31.601088', 'step': 1923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:31.631716', 'step': 1923, 'epoch': 3} {'type': 'loss', 'content': 0.02800329215824604, 'timestamp': '2025-10-01 04:13:31.655283', 'step': 1924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.686117', 'step': 1924, 'epoch': 3} {'type': 'loss', 'content': 0.030857792124152184, 'timestamp': '2025-10-01 04:13:31.688098', 'step': 1925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:31.717738', 'step': 1925, 'epoch': 3} {'type': 'loss', 'content': 0.027378613129258156, 'timestamp': '2025-10-01 04:13:31.719815', 'step': 1926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.751177', 'step': 1926, 'epoch': 3} {'type': 'loss', 'content': 0.0037630724254995584, 'timestamp': '2025-10-01 04:13:31.753563', 'step': 1927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.784129', 'step': 1927, 'epoch': 3} {'type': 'loss', 'content': 0.009359322488307953, 'timestamp': '2025-10-01 04:13:31.809218', 'step': 1928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.839805', 'step': 1928, 'epoch': 3} {'type': 'loss', 'content': 0.021776987239718437, 'timestamp': '2025-10-01 04:13:31.841711', 'step': 1929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.871655', 'step': 1929, 'epoch': 3} {'type': 'loss', 'content': 0.04720321670174599, 'timestamp': '2025-10-01 04:13:31.873744', 'step': 1930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.903552', 'step': 1930, 'epoch': 3} {'type': 'loss', 'content': 0.010065094567835331, 'timestamp': '2025-10-01 04:13:31.905664', 'step': 1931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.935470', 'step': 1931, 'epoch': 3} {'type': 'loss', 'content': 0.005961592774838209, 'timestamp': '2025-10-01 04:13:31.959241', 'step': 1932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:31.989950', 'step': 1932, 'epoch': 3} {'type': 'loss', 'content': 0.0008259662427008152, 'timestamp': '2025-10-01 04:13:31.992442', 'step': 1933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:32.022090', 'step': 1933, 'epoch': 3} {'type': 'loss', 'content': 0.008287528529763222, 'timestamp': '2025-10-01 04:13:32.024453', 'step': 1934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:32.055265', 'step': 1934, 'epoch': 3} {'type': 'loss', 'content': 0.002223608084022999, 'timestamp': '2025-10-01 04:13:32.057659', 'step': 1935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:32.087501', 'step': 1935, 'epoch': 3} {'type': 'loss', 'content': 0.0037765316665172577, 'timestamp': '2025-10-01 04:13:32.111706', 'step': 1936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:32.142388', 'step': 1936, 'epoch': 3} {'type': 'loss', 'content': 0.010634335689246655, 'timestamp': '2025-10-01 04:13:32.144384', 'step': 1937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:32.175259', 'step': 1937, 'epoch': 3} {'type': 'loss', 'content': 0.008796789683401585, 'timestamp': '2025-10-01 04:13:32.177824', 'step': 1938, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:32.902030', 'step': 1938, 'epoch': 3} {'type': 'pplx', 'content': 59471434.40554708, 'timestamp': '2025-10-01 04:13:32.903935', 'step': 1938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:32.933279', 'step': 1938, 'epoch': 3} {'type': 'loss', 'content': 0.04188920930027962, 'timestamp': '2025-10-01 04:13:32.935486', 'step': 1939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:32.965804', 'step': 1939, 'epoch': 3} {'type': 'loss', 'content': 0.001124311238527298, 'timestamp': '2025-10-01 04:13:32.989586', 'step': 1940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.019772', 'step': 1940, 'epoch': 3} {'type': 'loss', 'content': 0.004740908741950989, 'timestamp': '2025-10-01 04:13:33.021948', 'step': 1941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.053776', 'step': 1941, 'epoch': 3} {'type': 'loss', 'content': 0.015581603161990643, 'timestamp': '2025-10-01 04:13:33.056248', 'step': 1942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.085781', 'step': 1942, 'epoch': 3} {'type': 'loss', 'content': 0.05138517543673515, 'timestamp': '2025-10-01 04:13:33.088154', 'step': 1943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:33.119782', 'step': 1943, 'epoch': 3} {'type': 'loss', 'content': 0.05064978078007698, 'timestamp': '2025-10-01 04:13:33.143991', 'step': 1944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.175911', 'step': 1944, 'epoch': 3} {'type': 'loss', 'content': 0.002924917731434107, 'timestamp': '2025-10-01 04:13:33.178647', 'step': 1945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.209408', 'step': 1945, 'epoch': 3} {'type': 'loss', 'content': 0.008786286227405071, 'timestamp': '2025-10-01 04:13:33.211405', 'step': 1946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.241561', 'step': 1946, 'epoch': 3} {'type': 'loss', 'content': 0.004422423895448446, 'timestamp': '2025-10-01 04:13:33.243950', 'step': 1947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.274674', 'step': 1947, 'epoch': 3} {'type': 'loss', 'content': 0.004494368098676205, 'timestamp': '2025-10-01 04:13:33.298784', 'step': 1948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.329977', 'step': 1948, 'epoch': 3} {'type': 'loss', 'content': 0.002485204953700304, 'timestamp': '2025-10-01 04:13:33.332296', 'step': 1949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:33.362428', 'step': 1949, 'epoch': 3} {'type': 'loss', 'content': 0.018244251608848572, 'timestamp': '2025-10-01 04:13:33.364472', 'step': 1950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.396877', 'step': 1950, 'epoch': 3} {'type': 'loss', 'content': 0.012204146943986416, 'timestamp': '2025-10-01 04:13:33.399354', 'step': 1951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.430007', 'step': 1951, 'epoch': 3} {'type': 'loss', 'content': 0.017980916425585747, 'timestamp': '2025-10-01 04:13:33.454051', 'step': 1952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.486631', 'step': 1952, 'epoch': 3} {'type': 'loss', 'content': 0.0020012136083096266, 'timestamp': '2025-10-01 04:13:33.488563', 'step': 1953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.519185', 'step': 1953, 'epoch': 3} {'type': 'loss', 'content': 0.0037194720935076475, 'timestamp': '2025-10-01 04:13:33.521326', 'step': 1954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.554933', 'step': 1954, 'epoch': 3} {'type': 'loss', 'content': 0.03667493537068367, 'timestamp': '2025-10-01 04:13:33.557126', 'step': 1955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:33.589409', 'step': 1955, 'epoch': 3} {'type': 'loss', 'content': 0.02140049636363983, 'timestamp': '2025-10-01 04:13:33.614408', 'step': 1956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:33.645953', 'step': 1956, 'epoch': 3} {'type': 'loss', 'content': 0.015395239926874638, 'timestamp': '2025-10-01 04:13:33.648748', 'step': 1957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.679684', 'step': 1957, 'epoch': 3} {'type': 'loss', 'content': 0.0056080566719174385, 'timestamp': '2025-10-01 04:13:33.681994', 'step': 1958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.714060', 'step': 1958, 'epoch': 3} {'type': 'loss', 'content': 0.004278783220797777, 'timestamp': '2025-10-01 04:13:33.716110', 'step': 1959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.747166', 'step': 1959, 'epoch': 3} {'type': 'loss', 'content': 0.031007369980216026, 'timestamp': '2025-10-01 04:13:33.770944', 'step': 1960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:33.802364', 'step': 1960, 'epoch': 3} {'type': 'loss', 'content': 0.004366572946310043, 'timestamp': '2025-10-01 04:13:33.804460', 'step': 1961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.835489', 'step': 1961, 'epoch': 3} {'type': 'loss', 'content': 0.003953932784497738, 'timestamp': '2025-10-01 04:13:33.837553', 'step': 1962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.869313', 'step': 1962, 'epoch': 3} {'type': 'loss', 'content': 0.020310308784246445, 'timestamp': '2025-10-01 04:13:33.871664', 'step': 1963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.901955', 'step': 1963, 'epoch': 3} {'type': 'loss', 'content': 0.040750809013843536, 'timestamp': '2025-10-01 04:13:33.925390', 'step': 1964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:33.955820', 'step': 1964, 'epoch': 3} {'type': 'loss', 'content': 0.005061944015324116, 'timestamp': '2025-10-01 04:13:33.957668', 'step': 1965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:33.988940', 'step': 1965, 'epoch': 3} {'type': 'loss', 'content': 0.0036905964370816946, 'timestamp': '2025-10-01 04:13:33.991360', 'step': 1966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.021359', 'step': 1966, 'epoch': 3} {'type': 'loss', 'content': 0.005855924915522337, 'timestamp': '2025-10-01 04:13:34.023692', 'step': 1967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.053658', 'step': 1967, 'epoch': 3} {'type': 'loss', 'content': 0.02116120420396328, 'timestamp': '2025-10-01 04:13:34.077414', 'step': 1968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:34.108331', 'step': 1968, 'epoch': 3} {'type': 'loss', 'content': 0.002757231006398797, 'timestamp': '2025-10-01 04:13:34.110196', 'step': 1969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.141899', 'step': 1969, 'epoch': 3} {'type': 'loss', 'content': 0.011400270275771618, 'timestamp': '2025-10-01 04:13:34.144151', 'step': 1970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:34.174519', 'step': 1970, 'epoch': 3} {'type': 'loss', 'content': 0.011773461475968361, 'timestamp': '2025-10-01 04:13:34.176907', 'step': 1971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.207821', 'step': 1971, 'epoch': 3} {'type': 'loss', 'content': 0.02493475005030632, 'timestamp': '2025-10-01 04:13:34.231446', 'step': 1972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.261596', 'step': 1972, 'epoch': 3} {'type': 'loss', 'content': 0.021315688267350197, 'timestamp': '2025-10-01 04:13:34.263901', 'step': 1973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.294275', 'step': 1973, 'epoch': 3} {'type': 'loss', 'content': 0.008046154864132404, 'timestamp': '2025-10-01 04:13:34.296656', 'step': 1974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.326601', 'step': 1974, 'epoch': 3} {'type': 'loss', 'content': 0.024015789851546288, 'timestamp': '2025-10-01 04:13:34.328973', 'step': 1975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.360233', 'step': 1975, 'epoch': 3} {'type': 'loss', 'content': 0.025987252593040466, 'timestamp': '2025-10-01 04:13:34.383785', 'step': 1976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.415222', 'step': 1976, 'epoch': 3} {'type': 'loss', 'content': 0.008847145363688469, 'timestamp': '2025-10-01 04:13:34.417304', 'step': 1977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.447961', 'step': 1977, 'epoch': 3} {'type': 'loss', 'content': 0.01479099690914154, 'timestamp': '2025-10-01 04:13:34.450307', 'step': 1978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.480525', 'step': 1978, 'epoch': 3} {'type': 'loss', 'content': 0.010528464801609516, 'timestamp': '2025-10-01 04:13:34.482802', 'step': 1979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.512384', 'step': 1979, 'epoch': 3} {'type': 'loss', 'content': 0.0039046481251716614, 'timestamp': '2025-10-01 04:13:34.536097', 'step': 1980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.567508', 'step': 1980, 'epoch': 3} {'type': 'loss', 'content': 0.004746978636831045, 'timestamp': '2025-10-01 04:13:34.569515', 'step': 1981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.598932', 'step': 1981, 'epoch': 3} {'type': 'loss', 'content': 0.011390685103833675, 'timestamp': '2025-10-01 04:13:34.601337', 'step': 1982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.631583', 'step': 1982, 'epoch': 3} {'type': 'loss', 'content': 0.009219239465892315, 'timestamp': '2025-10-01 04:13:34.633942', 'step': 1983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.664092', 'step': 1983, 'epoch': 3} {'type': 'loss', 'content': 0.006982100661844015, 'timestamp': '2025-10-01 04:13:34.687609', 'step': 1984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.718286', 'step': 1984, 'epoch': 3} {'type': 'loss', 'content': 0.0032235754188150167, 'timestamp': '2025-10-01 04:13:34.720387', 'step': 1985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.750519', 'step': 1985, 'epoch': 3} {'type': 'loss', 'content': 0.001993803773075342, 'timestamp': '2025-10-01 04:13:34.752547', 'step': 1986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:34.783871', 'step': 1986, 'epoch': 3} {'type': 'loss', 'content': 0.018395518884062767, 'timestamp': '2025-10-01 04:13:34.785819', 'step': 1987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:34.816023', 'step': 1987, 'epoch': 3} {'type': 'loss', 'content': 0.01245657354593277, 'timestamp': '2025-10-01 04:13:34.839668', 'step': 1988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.870737', 'step': 1988, 'epoch': 3} {'type': 'loss', 'content': 0.014887102879583836, 'timestamp': '2025-10-01 04:13:34.872680', 'step': 1989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.902322', 'step': 1989, 'epoch': 3} {'type': 'loss', 'content': 0.0028198598884046078, 'timestamp': '2025-10-01 04:13:34.904112', 'step': 1990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.934024', 'step': 1990, 'epoch': 3} {'type': 'loss', 'content': 0.04544464498758316, 'timestamp': '2025-10-01 04:13:34.936110', 'step': 1991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:34.965363', 'step': 1991, 'epoch': 3} {'type': 'loss', 'content': 0.0295643862336874, 'timestamp': '2025-10-01 04:13:34.988948', 'step': 1992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:35.018979', 'step': 1992, 'epoch': 3} {'type': 'loss', 'content': 0.05637253448367119, 'timestamp': '2025-10-01 04:13:35.021179', 'step': 1993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:35.050950', 'step': 1993, 'epoch': 3} {'type': 'loss', 'content': 0.015412045642733574, 'timestamp': '2025-10-01 04:13:35.052952', 'step': 1994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:35.083055', 'step': 1994, 'epoch': 3} {'type': 'loss', 'content': 0.0036210103426128626, 'timestamp': '2025-10-01 04:13:35.085376', 'step': 1995, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:35.884810', 'step': 1995, 'epoch': 3} {'type': 'pplx', 'content': 45753030.53963223, 'timestamp': '2025-10-01 04:13:35.886915', 'step': 1995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:35.917191', 'step': 1995, 'epoch': 3} {'type': 'loss', 'content': 0.005879095755517483, 'timestamp': '2025-10-01 04:13:35.941385', 'step': 1996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:35.973474', 'step': 1996, 'epoch': 3} {'type': 'loss', 'content': 0.005743040703237057, 'timestamp': '2025-10-01 04:13:35.976332', 'step': 1997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:36.009613', 'step': 1997, 'epoch': 3} {'type': 'loss', 'content': 0.016395164653658867, 'timestamp': '2025-10-01 04:13:36.011895', 'step': 1998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:36.043705', 'step': 1998, 'epoch': 3} {'type': 'loss', 'content': 0.02659083716571331, 'timestamp': '2025-10-01 04:13:36.046226', 'step': 1999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:36.078620', 'step': 1999, 'epoch': 3} {'type': 'loss', 'content': 0.0178259015083313, 'timestamp': '2025-10-01 04:13:36.102703', 'step': 2000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-10-01 04:13:41.044832', 'step': 2000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.084808', 'step': 2000, 'epoch': 3} {'type': 'loss', 'content': 0.00914577953517437, 'timestamp': '2025-10-01 04:13:41.087024', 'step': 2001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.117081', 'step': 2001, 'epoch': 3} {'type': 'loss', 'content': 0.0025305438321083784, 'timestamp': '2025-10-01 04:13:41.118808', 'step': 2002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.148903', 'step': 2002, 'epoch': 3} {'type': 'loss', 'content': 0.0016962222289294004, 'timestamp': '2025-10-01 04:13:41.150818', 'step': 2003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.179960', 'step': 2003, 'epoch': 3} {'type': 'loss', 'content': 0.038361646234989166, 'timestamp': '2025-10-01 04:13:41.203542', 'step': 2004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:41.236584', 'step': 2004, 'epoch': 3} {'type': 'loss', 'content': 0.0017910029273480177, 'timestamp': '2025-10-01 04:13:41.238498', 'step': 2005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:41.267895', 'step': 2005, 'epoch': 3} {'type': 'loss', 'content': 0.029518509283661842, 'timestamp': '2025-10-01 04:13:41.269941', 'step': 2006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.299241', 'step': 2006, 'epoch': 3} {'type': 'loss', 'content': 0.008002429269254208, 'timestamp': '2025-10-01 04:13:41.301296', 'step': 2007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:41.331410', 'step': 2007, 'epoch': 3} {'type': 'loss', 'content': 0.020128855481743813, 'timestamp': '2025-10-01 04:13:41.354945', 'step': 2008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.385301', 'step': 2008, 'epoch': 3} {'type': 'loss', 'content': 0.00392829580232501, 'timestamp': '2025-10-01 04:13:41.387203', 'step': 2009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:41.416637', 'step': 2009, 'epoch': 3} {'type': 'loss', 'content': 0.02912677265703678, 'timestamp': '2025-10-01 04:13:41.418659', 'step': 2010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:41.449061', 'step': 2010, 'epoch': 3} {'type': 'loss', 'content': 0.00468469737097621, 'timestamp': '2025-10-01 04:13:41.451445', 'step': 2011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:41.481111', 'step': 2011, 'epoch': 3} {'type': 'loss', 'content': 0.00664009153842926, 'timestamp': '2025-10-01 04:13:41.504833', 'step': 2012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.534186', 'step': 2012, 'epoch': 3} {'type': 'loss', 'content': 0.002807890996336937, 'timestamp': '2025-10-01 04:13:41.536205', 'step': 2013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:41.565962', 'step': 2013, 'epoch': 3} {'type': 'loss', 'content': 0.004763583652675152, 'timestamp': '2025-10-01 04:13:41.568519', 'step': 2014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.599712', 'step': 2014, 'epoch': 3} {'type': 'loss', 'content': 0.022498924285173416, 'timestamp': '2025-10-01 04:13:41.601428', 'step': 2015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.631711', 'step': 2015, 'epoch': 3} {'type': 'loss', 'content': 0.01837407425045967, 'timestamp': '2025-10-01 04:13:41.655443', 'step': 2016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.685764', 'step': 2016, 'epoch': 3} {'type': 'loss', 'content': 0.007270206697285175, 'timestamp': '2025-10-01 04:13:41.688059', 'step': 2017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:41.719301', 'step': 2017, 'epoch': 3} {'type': 'loss', 'content': 0.012231186963617802, 'timestamp': '2025-10-01 04:13:41.721837', 'step': 2018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:41.752069', 'step': 2018, 'epoch': 3} {'type': 'loss', 'content': 0.003825768129900098, 'timestamp': '2025-10-01 04:13:41.754243', 'step': 2019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.784997', 'step': 2019, 'epoch': 3} {'type': 'loss', 'content': 0.0063703921623528, 'timestamp': '2025-10-01 04:13:41.808597', 'step': 2020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.838744', 'step': 2020, 'epoch': 3} {'type': 'loss', 'content': 0.004631161689758301, 'timestamp': '2025-10-01 04:13:41.843697', 'step': 2021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.880007', 'step': 2021, 'epoch': 3} {'type': 'loss', 'content': 0.005757237318903208, 'timestamp': '2025-10-01 04:13:41.882171', 'step': 2022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:41.914333', 'step': 2022, 'epoch': 3} {'type': 'loss', 'content': 0.002572003984823823, 'timestamp': '2025-10-01 04:13:41.917093', 'step': 2023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:41.949578', 'step': 2023, 'epoch': 3} {'type': 'loss', 'content': 0.01088919211179018, 'timestamp': '2025-10-01 04:13:41.973474', 'step': 2024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.006547', 'step': 2024, 'epoch': 3} {'type': 'loss', 'content': 0.017445290461182594, 'timestamp': '2025-10-01 04:13:42.008041', 'step': 2025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:42.044072', 'step': 2025, 'epoch': 3} {'type': 'loss', 'content': 0.002059668768197298, 'timestamp': '2025-10-01 04:13:42.046283', 'step': 2026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:42.078986', 'step': 2026, 'epoch': 3} {'type': 'loss', 'content': 0.004977476317435503, 'timestamp': '2025-10-01 04:13:42.081441', 'step': 2027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:42.117496', 'step': 2027, 'epoch': 3} {'type': 'loss', 'content': 0.011412362568080425, 'timestamp': '2025-10-01 04:13:42.141905', 'step': 2028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:42.175179', 'step': 2028, 'epoch': 3} {'type': 'loss', 'content': 0.038777027279138565, 'timestamp': '2025-10-01 04:13:42.177100', 'step': 2029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.209310', 'step': 2029, 'epoch': 3} {'type': 'loss', 'content': 0.01822769083082676, 'timestamp': '2025-10-01 04:13:42.211295', 'step': 2030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:42.243929', 'step': 2030, 'epoch': 3} {'type': 'loss', 'content': 0.003951018210500479, 'timestamp': '2025-10-01 04:13:42.246866', 'step': 2031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:42.277135', 'step': 2031, 'epoch': 3} {'type': 'loss', 'content': 0.010680814273655415, 'timestamp': '2025-10-01 04:13:42.300961', 'step': 2032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:42.331895', 'step': 2032, 'epoch': 3} {'type': 'loss', 'content': 0.012020791880786419, 'timestamp': '2025-10-01 04:13:42.334238', 'step': 2033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.364424', 'step': 2033, 'epoch': 3} {'type': 'loss', 'content': 0.014839199371635914, 'timestamp': '2025-10-01 04:13:42.368089', 'step': 2034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:42.400671', 'step': 2034, 'epoch': 3} {'type': 'loss', 'content': 0.004194910638034344, 'timestamp': '2025-10-01 04:13:42.403322', 'step': 2035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.433814', 'step': 2035, 'epoch': 3} {'type': 'loss', 'content': 0.020307643339037895, 'timestamp': '2025-10-01 04:13:42.457841', 'step': 2036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.488434', 'step': 2036, 'epoch': 3} {'type': 'loss', 'content': 0.012298102490603924, 'timestamp': '2025-10-01 04:13:42.490738', 'step': 2037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:42.520715', 'step': 2037, 'epoch': 3} {'type': 'loss', 'content': 0.023398157209157944, 'timestamp': '2025-10-01 04:13:42.523148', 'step': 2038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:42.554800', 'step': 2038, 'epoch': 3} {'type': 'loss', 'content': 0.0030963234603405, 'timestamp': '2025-10-01 04:13:42.557395', 'step': 2039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.587791', 'step': 2039, 'epoch': 3} {'type': 'loss', 'content': 0.0033669748809188604, 'timestamp': '2025-10-01 04:13:42.611517', 'step': 2040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.641218', 'step': 2040, 'epoch': 3} {'type': 'loss', 'content': 0.010291491635143757, 'timestamp': '2025-10-01 04:13:42.643439', 'step': 2041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.674332', 'step': 2041, 'epoch': 3} {'type': 'loss', 'content': 0.001421350403688848, 'timestamp': '2025-10-01 04:13:42.676645', 'step': 2042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.706673', 'step': 2042, 'epoch': 3} {'type': 'loss', 'content': 0.010278047993779182, 'timestamp': '2025-10-01 04:13:42.709041', 'step': 2043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.738388', 'step': 2043, 'epoch': 3} {'type': 'loss', 'content': 0.015349557623267174, 'timestamp': '2025-10-01 04:13:42.762173', 'step': 2044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.792206', 'step': 2044, 'epoch': 3} {'type': 'loss', 'content': 0.003486798843368888, 'timestamp': '2025-10-01 04:13:42.794905', 'step': 2045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.824916', 'step': 2045, 'epoch': 3} {'type': 'loss', 'content': 0.05607367306947708, 'timestamp': '2025-10-01 04:13:42.827353', 'step': 2046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.857918', 'step': 2046, 'epoch': 3} {'type': 'loss', 'content': 0.011063175275921822, 'timestamp': '2025-10-01 04:13:42.865769', 'step': 2047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.895703', 'step': 2047, 'epoch': 3} {'type': 'loss', 'content': 0.0091552147641778, 'timestamp': '2025-10-01 04:13:42.919950', 'step': 2048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.949538', 'step': 2048, 'epoch': 3} {'type': 'loss', 'content': 0.002595100784674287, 'timestamp': '2025-10-01 04:13:42.951759', 'step': 2049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:42.981956', 'step': 2049, 'epoch': 3} {'type': 'loss', 'content': 0.009335712529718876, 'timestamp': '2025-10-01 04:13:42.984094', 'step': 2050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:43.015008', 'step': 2050, 'epoch': 3} {'type': 'loss', 'content': 0.0054068006575107574, 'timestamp': '2025-10-01 04:13:43.017493', 'step': 2051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:43.048016', 'step': 2051, 'epoch': 3} {'type': 'loss', 'content': 0.037687864154577255, 'timestamp': '2025-10-01 04:13:43.072541', 'step': 2052, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:43.824169', 'step': 2052, 'epoch': 3} {'type': 'pplx', 'content': 49795955.38740884, 'timestamp': '2025-10-01 04:13:43.825908', 'step': 2052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:43.854213', 'step': 2052, 'epoch': 3} {'type': 'loss', 'content': 0.003781549632549286, 'timestamp': '2025-10-01 04:13:43.856335', 'step': 2053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:43.886600', 'step': 2053, 'epoch': 3} {'type': 'loss', 'content': 0.006803255993872881, 'timestamp': '2025-10-01 04:13:43.888530', 'step': 2054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:43.918473', 'step': 2054, 'epoch': 3} {'type': 'loss', 'content': 0.009120832197368145, 'timestamp': '2025-10-01 04:13:43.921096', 'step': 2055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:43.950353', 'step': 2055, 'epoch': 3} {'type': 'loss', 'content': 0.002633993746712804, 'timestamp': '2025-10-01 04:13:43.974505', 'step': 2056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.004236', 'step': 2056, 'epoch': 3} {'type': 'loss', 'content': 0.016486208885908127, 'timestamp': '2025-10-01 04:13:44.006194', 'step': 2057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.036241', 'step': 2057, 'epoch': 3} {'type': 'loss', 'content': 0.005618637893348932, 'timestamp': '2025-10-01 04:13:44.038098', 'step': 2058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.068300', 'step': 2058, 'epoch': 3} {'type': 'loss', 'content': 0.007397721987217665, 'timestamp': '2025-10-01 04:13:44.070489', 'step': 2059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.100564', 'step': 2059, 'epoch': 3} {'type': 'loss', 'content': 0.005749840755015612, 'timestamp': '2025-10-01 04:13:44.124490', 'step': 2060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.156311', 'step': 2060, 'epoch': 3} {'type': 'loss', 'content': 0.0069750151596963406, 'timestamp': '2025-10-01 04:13:44.159010', 'step': 2061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:44.190939', 'step': 2061, 'epoch': 3} {'type': 'loss', 'content': 0.001430398435331881, 'timestamp': '2025-10-01 04:13:44.193187', 'step': 2062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:44.223096', 'step': 2062, 'epoch': 3} {'type': 'loss', 'content': 0.017816442996263504, 'timestamp': '2025-10-01 04:13:44.225709', 'step': 2063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.257661', 'step': 2063, 'epoch': 3} {'type': 'loss', 'content': 0.010645156726241112, 'timestamp': '2025-10-01 04:13:44.281236', 'step': 2064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.312333', 'step': 2064, 'epoch': 3} {'type': 'loss', 'content': 0.001670172088779509, 'timestamp': '2025-10-01 04:13:44.314198', 'step': 2065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.343736', 'step': 2065, 'epoch': 3} {'type': 'loss', 'content': 0.005457701627165079, 'timestamp': '2025-10-01 04:13:44.346086', 'step': 2066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:44.378394', 'step': 2066, 'epoch': 3} {'type': 'loss', 'content': 0.01062505878508091, 'timestamp': '2025-10-01 04:13:44.380558', 'step': 2067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.411292', 'step': 2067, 'epoch': 3} {'type': 'loss', 'content': 0.026712706312537193, 'timestamp': '2025-10-01 04:13:44.435464', 'step': 2068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.466555', 'step': 2068, 'epoch': 3} {'type': 'loss', 'content': 0.0025216001085937023, 'timestamp': '2025-10-01 04:13:44.468527', 'step': 2069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.498443', 'step': 2069, 'epoch': 3} {'type': 'loss', 'content': 0.01525406539440155, 'timestamp': '2025-10-01 04:13:44.500631', 'step': 2070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:44.530696', 'step': 2070, 'epoch': 3} {'type': 'loss', 'content': 0.0010936484904959798, 'timestamp': '2025-10-01 04:13:44.532965', 'step': 2071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.563112', 'step': 2071, 'epoch': 3} {'type': 'loss', 'content': 0.031001046299934387, 'timestamp': '2025-10-01 04:13:44.587137', 'step': 2072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.618608', 'step': 2072, 'epoch': 3} {'type': 'loss', 'content': 0.009324406273663044, 'timestamp': '2025-10-01 04:13:44.621026', 'step': 2073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.650867', 'step': 2073, 'epoch': 3} {'type': 'loss', 'content': 0.00281874043866992, 'timestamp': '2025-10-01 04:13:44.653183', 'step': 2074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:44.685662', 'step': 2074, 'epoch': 3} {'type': 'loss', 'content': 0.0038216691464185715, 'timestamp': '2025-10-01 04:13:44.688020', 'step': 2075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.721222', 'step': 2075, 'epoch': 3} {'type': 'loss', 'content': 0.0112691018730402, 'timestamp': '2025-10-01 04:13:44.745417', 'step': 2076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.780566', 'step': 2076, 'epoch': 3} {'type': 'loss', 'content': 0.004401656799018383, 'timestamp': '2025-10-01 04:13:44.782824', 'step': 2077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:44.814464', 'step': 2077, 'epoch': 3} {'type': 'loss', 'content': 0.015684420242905617, 'timestamp': '2025-10-01 04:13:44.816835', 'step': 2078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.847840', 'step': 2078, 'epoch': 3} {'type': 'loss', 'content': 0.01610006019473076, 'timestamp': '2025-10-01 04:13:44.850063', 'step': 2079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.880699', 'step': 2079, 'epoch': 3} {'type': 'loss', 'content': 0.00259937415830791, 'timestamp': '2025-10-01 04:13:44.904664', 'step': 2080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:44.934179', 'step': 2080, 'epoch': 3} {'type': 'loss', 'content': 0.006803158205002546, 'timestamp': '2025-10-01 04:13:44.936653', 'step': 2081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.966449', 'step': 2081, 'epoch': 3} {'type': 'loss', 'content': 0.012734351679682732, 'timestamp': '2025-10-01 04:13:44.969008', 'step': 2082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:44.999155', 'step': 2082, 'epoch': 3} {'type': 'loss', 'content': 0.005645200610160828, 'timestamp': '2025-10-01 04:13:45.002756', 'step': 2083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:45.032700', 'step': 2083, 'epoch': 3} {'type': 'loss', 'content': 0.002019307343289256, 'timestamp': '2025-10-01 04:13:45.056534', 'step': 2084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.086792', 'step': 2084, 'epoch': 3} {'type': 'loss', 'content': 0.04551689699292183, 'timestamp': '2025-10-01 04:13:45.089082', 'step': 2085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.124508', 'step': 2085, 'epoch': 3} {'type': 'loss', 'content': 0.012857954017817974, 'timestamp': '2025-10-01 04:13:45.126591', 'step': 2086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:45.156515', 'step': 2086, 'epoch': 3} {'type': 'loss', 'content': 0.006605700124055147, 'timestamp': '2025-10-01 04:13:45.159999', 'step': 2087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:45.190392', 'step': 2087, 'epoch': 3} {'type': 'loss', 'content': 0.008717053569853306, 'timestamp': '2025-10-01 04:13:45.214408', 'step': 2088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:45.244771', 'step': 2088, 'epoch': 3} {'type': 'loss', 'content': 0.0019231383921578526, 'timestamp': '2025-10-01 04:13:45.247092', 'step': 2089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:45.278038', 'step': 2089, 'epoch': 3} {'type': 'loss', 'content': 0.0013605657732114196, 'timestamp': '2025-10-01 04:13:45.281072', 'step': 2090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.310793', 'step': 2090, 'epoch': 3} {'type': 'loss', 'content': 0.0019781345035880804, 'timestamp': '2025-10-01 04:13:45.313165', 'step': 2091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.343456', 'step': 2091, 'epoch': 3} {'type': 'loss', 'content': 0.011783729307353497, 'timestamp': '2025-10-01 04:13:45.367477', 'step': 2092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.396984', 'step': 2092, 'epoch': 3} {'type': 'loss', 'content': 0.0034526768140494823, 'timestamp': '2025-10-01 04:13:45.401775', 'step': 2093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:45.433859', 'step': 2093, 'epoch': 3} {'type': 'loss', 'content': 0.012783399783074856, 'timestamp': '2025-10-01 04:13:45.436583', 'step': 2094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.467633', 'step': 2094, 'epoch': 3} {'type': 'loss', 'content': 0.003527685534209013, 'timestamp': '2025-10-01 04:13:45.470766', 'step': 2095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.503424', 'step': 2095, 'epoch': 3} {'type': 'loss', 'content': 0.033071406185626984, 'timestamp': '2025-10-01 04:13:45.527184', 'step': 2096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:45.559430', 'step': 2096, 'epoch': 3} {'type': 'loss', 'content': 0.0020220919977873564, 'timestamp': '2025-10-01 04:13:45.562264', 'step': 2097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:45.595810', 'step': 2097, 'epoch': 3} {'type': 'loss', 'content': 0.021835412830114365, 'timestamp': '2025-10-01 04:13:45.598911', 'step': 2098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.630928', 'step': 2098, 'epoch': 3} {'type': 'loss', 'content': 0.014858120121061802, 'timestamp': '2025-10-01 04:13:45.634070', 'step': 2099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.666235', 'step': 2099, 'epoch': 3} {'type': 'loss', 'content': 0.006137244403362274, 'timestamp': '2025-10-01 04:13:45.690758', 'step': 2100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:45.721698', 'step': 2100, 'epoch': 3} {'type': 'loss', 'content': 0.001460201689042151, 'timestamp': '2025-10-01 04:13:45.724550', 'step': 2101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:45.756040', 'step': 2101, 'epoch': 3} {'type': 'loss', 'content': 0.0022680035326629877, 'timestamp': '2025-10-01 04:13:45.758784', 'step': 2102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:45.791364', 'step': 2102, 'epoch': 3} {'type': 'loss', 'content': 0.015940619632601738, 'timestamp': '2025-10-01 04:13:45.794323', 'step': 2103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.826211', 'step': 2103, 'epoch': 3} {'type': 'loss', 'content': 0.001429266412742436, 'timestamp': '2025-10-01 04:13:45.850814', 'step': 2104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.883253', 'step': 2104, 'epoch': 3} {'type': 'loss', 'content': 0.0032171185594052076, 'timestamp': '2025-10-01 04:13:45.886173', 'step': 2105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:45.918341', 'step': 2105, 'epoch': 3} {'type': 'loss', 'content': 0.004030963871628046, 'timestamp': '2025-10-01 04:13:45.921093', 'step': 2106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:45.954527', 'step': 2106, 'epoch': 3} {'type': 'loss', 'content': 0.005968966521322727, 'timestamp': '2025-10-01 04:13:45.957054', 'step': 2107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:45.991918', 'step': 2107, 'epoch': 3} {'type': 'loss', 'content': 0.009835876524448395, 'timestamp': '2025-10-01 04:13:46.015902', 'step': 2108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:46.049093', 'step': 2108, 'epoch': 3} {'type': 'loss', 'content': 0.008758926764130592, 'timestamp': '2025-10-01 04:13:46.051296', 'step': 2109, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:46.863488', 'step': 2109, 'epoch': 3} {'type': 'pplx', 'content': 48746742.06245831, 'timestamp': '2025-10-01 04:13:46.866008', 'step': 2109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:46.896272', 'step': 2109, 'epoch': 3} {'type': 'loss', 'content': 0.0032065026462078094, 'timestamp': '2025-10-01 04:13:46.899890', 'step': 2110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:46.933939', 'step': 2110, 'epoch': 3} {'type': 'loss', 'content': 0.00044523560791276395, 'timestamp': '2025-10-01 04:13:46.937038', 'step': 2111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:46.968805', 'step': 2111, 'epoch': 3} {'type': 'loss', 'content': 0.01161577831953764, 'timestamp': '2025-10-01 04:13:46.993753', 'step': 2112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:47.025918', 'step': 2112, 'epoch': 3} {'type': 'loss', 'content': 0.004290744196623564, 'timestamp': '2025-10-01 04:13:47.028884', 'step': 2113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:47.062758', 'step': 2113, 'epoch': 3} {'type': 'loss', 'content': 0.022032011300325394, 'timestamp': '2025-10-01 04:13:47.065644', 'step': 2114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.097294', 'step': 2114, 'epoch': 3} {'type': 'loss', 'content': 0.0017666907515376806, 'timestamp': '2025-10-01 04:13:47.100008', 'step': 2115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.131075', 'step': 2115, 'epoch': 3} {'type': 'loss', 'content': 0.003643403295427561, 'timestamp': '2025-10-01 04:13:47.155465', 'step': 2116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.187837', 'step': 2116, 'epoch': 3} {'type': 'loss', 'content': 0.003369935555383563, 'timestamp': '2025-10-01 04:13:47.190500', 'step': 2117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:47.222947', 'step': 2117, 'epoch': 3} {'type': 'loss', 'content': 0.03525031730532646, 'timestamp': '2025-10-01 04:13:47.226235', 'step': 2118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.257844', 'step': 2118, 'epoch': 3} {'type': 'loss', 'content': 0.0019307138863950968, 'timestamp': '2025-10-01 04:13:47.262778', 'step': 2119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.294941', 'step': 2119, 'epoch': 3} {'type': 'loss', 'content': 0.0005271370173431933, 'timestamp': '2025-10-01 04:13:47.319366', 'step': 2120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:47.350854', 'step': 2120, 'epoch': 3} {'type': 'loss', 'content': 0.0013889552792534232, 'timestamp': '2025-10-01 04:13:47.354098', 'step': 2121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:47.388632', 'step': 2121, 'epoch': 3} {'type': 'loss', 'content': 0.03613368794322014, 'timestamp': '2025-10-01 04:13:47.391371', 'step': 2122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.422908', 'step': 2122, 'epoch': 3} {'type': 'loss', 'content': 0.02574986405670643, 'timestamp': '2025-10-01 04:13:47.424925', 'step': 2123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.454375', 'step': 2123, 'epoch': 3} {'type': 'loss', 'content': 0.0014076424995437264, 'timestamp': '2025-10-01 04:13:47.478253', 'step': 2124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.508211', 'step': 2124, 'epoch': 3} {'type': 'loss', 'content': 0.005204681307077408, 'timestamp': '2025-10-01 04:13:47.510895', 'step': 2125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:47.541858', 'step': 2125, 'epoch': 3} {'type': 'loss', 'content': 0.005410588346421719, 'timestamp': '2025-10-01 04:13:47.544363', 'step': 2126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.574928', 'step': 2126, 'epoch': 3} {'type': 'loss', 'content': 0.003872995963320136, 'timestamp': '2025-10-01 04:13:47.577864', 'step': 2127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.607857', 'step': 2127, 'epoch': 3} {'type': 'loss', 'content': 0.016898931935429573, 'timestamp': '2025-10-01 04:13:47.631574', 'step': 2128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.661728', 'step': 2128, 'epoch': 3} {'type': 'loss', 'content': 0.009217137470841408, 'timestamp': '2025-10-01 04:13:47.664119', 'step': 2129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:47.694288', 'step': 2129, 'epoch': 3} {'type': 'loss', 'content': 0.002619354287162423, 'timestamp': '2025-10-01 04:13:47.697060', 'step': 2130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.726556', 'step': 2130, 'epoch': 3} {'type': 'loss', 'content': 0.0029307217337191105, 'timestamp': '2025-10-01 04:13:47.728790', 'step': 2131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.758161', 'step': 2131, 'epoch': 3} {'type': 'loss', 'content': 0.008102916181087494, 'timestamp': '2025-10-01 04:13:47.783415', 'step': 2132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:47.819274', 'step': 2132, 'epoch': 3} {'type': 'loss', 'content': 0.02566385827958584, 'timestamp': '2025-10-01 04:13:47.822776', 'step': 2133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:47.852589', 'step': 2133, 'epoch': 3} {'type': 'loss', 'content': 0.001259711803868413, 'timestamp': '2025-10-01 04:13:47.854906', 'step': 2134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.885246', 'step': 2134, 'epoch': 3} {'type': 'loss', 'content': 0.0018119417363777757, 'timestamp': '2025-10-01 04:13:47.887927', 'step': 2135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:47.920539', 'step': 2135, 'epoch': 3} {'type': 'loss', 'content': 0.0007317255367524922, 'timestamp': '2025-10-01 04:13:47.944256', 'step': 2136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:47.975686', 'step': 2136, 'epoch': 3} {'type': 'loss', 'content': 0.003940492402762175, 'timestamp': '2025-10-01 04:13:47.977981', 'step': 2137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.009006', 'step': 2137, 'epoch': 3} {'type': 'loss', 'content': 0.006948811002075672, 'timestamp': '2025-10-01 04:13:48.011279', 'step': 2138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.042415', 'step': 2138, 'epoch': 3} {'type': 'loss', 'content': 0.0008612987585365772, 'timestamp': '2025-10-01 04:13:48.044758', 'step': 2139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.075121', 'step': 2139, 'epoch': 3} {'type': 'loss', 'content': 0.002451820531859994, 'timestamp': '2025-10-01 04:13:48.099113', 'step': 2140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.129880', 'step': 2140, 'epoch': 3} {'type': 'loss', 'content': 0.0021910052746534348, 'timestamp': '2025-10-01 04:13:48.132202', 'step': 2141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.162599', 'step': 2141, 'epoch': 3} {'type': 'loss', 'content': 0.0032072842586785555, 'timestamp': '2025-10-01 04:13:48.165034', 'step': 2142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.196290', 'step': 2142, 'epoch': 3} {'type': 'loss', 'content': 0.0010801416356116533, 'timestamp': '2025-10-01 04:13:48.199115', 'step': 2143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.229594', 'step': 2143, 'epoch': 3} {'type': 'loss', 'content': 0.0031351468060165644, 'timestamp': '2025-10-01 04:13:48.253294', 'step': 2144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.283374', 'step': 2144, 'epoch': 3} {'type': 'loss', 'content': 0.0016169316368177533, 'timestamp': '2025-10-01 04:13:48.285652', 'step': 2145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.315534', 'step': 2145, 'epoch': 3} {'type': 'loss', 'content': 0.000727647275198251, 'timestamp': '2025-10-01 04:13:48.317900', 'step': 2146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.349058', 'step': 2146, 'epoch': 3} {'type': 'loss', 'content': 0.010667803697288036, 'timestamp': '2025-10-01 04:13:48.351754', 'step': 2147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.381734', 'step': 2147, 'epoch': 3} {'type': 'loss', 'content': 0.0024543162435293198, 'timestamp': '2025-10-01 04:13:48.405543', 'step': 2148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.436106', 'step': 2148, 'epoch': 3} {'type': 'loss', 'content': 0.004013399593532085, 'timestamp': '2025-10-01 04:13:48.438645', 'step': 2149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.468985', 'step': 2149, 'epoch': 3} {'type': 'loss', 'content': 0.00043969464604742825, 'timestamp': '2025-10-01 04:13:48.471292', 'step': 2150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.501856', 'step': 2150, 'epoch': 3} {'type': 'loss', 'content': 0.012558751739561558, 'timestamp': '2025-10-01 04:13:48.504957', 'step': 2151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:48.535528', 'step': 2151, 'epoch': 3} {'type': 'loss', 'content': 0.003782300977036357, 'timestamp': '2025-10-01 04:13:48.560637', 'step': 2152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.591438', 'step': 2152, 'epoch': 3} {'type': 'loss', 'content': 0.0036086090840399265, 'timestamp': '2025-10-01 04:13:48.593827', 'step': 2153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:48.623692', 'step': 2153, 'epoch': 3} {'type': 'loss', 'content': 0.0017416387563571334, 'timestamp': '2025-10-01 04:13:48.626270', 'step': 2154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:48.657538', 'step': 2154, 'epoch': 3} {'type': 'loss', 'content': 0.002613009186461568, 'timestamp': '2025-10-01 04:13:48.660086', 'step': 2155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.691023', 'step': 2155, 'epoch': 3} {'type': 'loss', 'content': 0.001628889818675816, 'timestamp': '2025-10-01 04:13:48.714753', 'step': 2156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.744799', 'step': 2156, 'epoch': 3} {'type': 'loss', 'content': 0.004056987352669239, 'timestamp': '2025-10-01 04:13:48.746853', 'step': 2157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:48.776634', 'step': 2157, 'epoch': 3} {'type': 'loss', 'content': 0.000895503384526819, 'timestamp': '2025-10-01 04:13:48.779343', 'step': 2158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:48.810105', 'step': 2158, 'epoch': 3} {'type': 'loss', 'content': 0.003871302818879485, 'timestamp': '2025-10-01 04:13:48.812770', 'step': 2159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.842931', 'step': 2159, 'epoch': 3} {'type': 'loss', 'content': 0.0022862900514155626, 'timestamp': '2025-10-01 04:13:48.866729', 'step': 2160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.896942', 'step': 2160, 'epoch': 3} {'type': 'loss', 'content': 0.007540857885032892, 'timestamp': '2025-10-01 04:13:48.899302', 'step': 2161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:48.930076', 'step': 2161, 'epoch': 3} {'type': 'loss', 'content': 0.02161705680191517, 'timestamp': '2025-10-01 04:13:48.932998', 'step': 2162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:48.963775', 'step': 2162, 'epoch': 3} {'type': 'loss', 'content': 0.011639682576060295, 'timestamp': '2025-10-01 04:13:48.966183', 'step': 2163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:48.996938', 'step': 2163, 'epoch': 3} {'type': 'loss', 'content': 0.005038486327975988, 'timestamp': '2025-10-01 04:13:49.020652', 'step': 2164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:49.052583', 'step': 2164, 'epoch': 3} {'type': 'loss', 'content': 0.0005112849758006632, 'timestamp': '2025-10-01 04:13:49.054967', 'step': 2165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:49.085636', 'step': 2165, 'epoch': 3} {'type': 'loss', 'content': 0.0019404730992391706, 'timestamp': '2025-10-01 04:13:49.088156', 'step': 2166, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:49.839197', 'step': 2166, 'epoch': 3} {'type': 'pplx', 'content': 52580040.12786619, 'timestamp': '2025-10-01 04:13:49.841341', 'step': 2166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:49.871723', 'step': 2166, 'epoch': 3} {'type': 'loss', 'content': 0.021567419171333313, 'timestamp': '2025-10-01 04:13:49.874363', 'step': 2167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:49.905190', 'step': 2167, 'epoch': 3} {'type': 'loss', 'content': 0.0038097966462373734, 'timestamp': '2025-10-01 04:13:49.929270', 'step': 2168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:49.959826', 'step': 2168, 'epoch': 3} {'type': 'loss', 'content': 0.001439273008145392, 'timestamp': '2025-10-01 04:13:49.962113', 'step': 2169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:49.993800', 'step': 2169, 'epoch': 3} {'type': 'loss', 'content': 0.0005533623043447733, 'timestamp': '2025-10-01 04:13:49.996628', 'step': 2170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.028080', 'step': 2170, 'epoch': 3} {'type': 'loss', 'content': 0.0012496901908889413, 'timestamp': '2025-10-01 04:13:50.030672', 'step': 2171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.061894', 'step': 2171, 'epoch': 3} {'type': 'loss', 'content': 0.0007430757395923138, 'timestamp': '2025-10-01 04:13:50.090651', 'step': 2172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.130287', 'step': 2172, 'epoch': 3} {'type': 'loss', 'content': 0.0024324983824044466, 'timestamp': '2025-10-01 04:13:50.132440', 'step': 2173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.162682', 'step': 2173, 'epoch': 3} {'type': 'loss', 'content': 0.015526972711086273, 'timestamp': '2025-10-01 04:13:50.165032', 'step': 2174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:50.196207', 'step': 2174, 'epoch': 3} {'type': 'loss', 'content': 0.003723499597981572, 'timestamp': '2025-10-01 04:13:50.200706', 'step': 2175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.232248', 'step': 2175, 'epoch': 3} {'type': 'loss', 'content': 0.0009116663713939488, 'timestamp': '2025-10-01 04:13:50.256033', 'step': 2176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.287948', 'step': 2176, 'epoch': 3} {'type': 'loss', 'content': 0.0016930032288655639, 'timestamp': '2025-10-01 04:13:50.290258', 'step': 2177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.319935', 'step': 2177, 'epoch': 3} {'type': 'loss', 'content': 0.07970846444368362, 'timestamp': '2025-10-01 04:13:50.322171', 'step': 2178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.352874', 'step': 2178, 'epoch': 3} {'type': 'loss', 'content': 0.0005008368170820177, 'timestamp': '2025-10-01 04:13:50.355091', 'step': 2179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.385190', 'step': 2179, 'epoch': 3} {'type': 'loss', 'content': 0.0017755134031176567, 'timestamp': '2025-10-01 04:13:50.409568', 'step': 2180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.439865', 'step': 2180, 'epoch': 3} {'type': 'loss', 'content': 0.0006339551182463765, 'timestamp': '2025-10-01 04:13:50.442350', 'step': 2181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.472507', 'step': 2181, 'epoch': 3} {'type': 'loss', 'content': 0.0005257153534330428, 'timestamp': '2025-10-01 04:13:50.475103', 'step': 2182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.505242', 'step': 2182, 'epoch': 3} {'type': 'loss', 'content': 0.0016372958198189735, 'timestamp': '2025-10-01 04:13:50.507994', 'step': 2183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.538273', 'step': 2183, 'epoch': 3} {'type': 'loss', 'content': 0.0017383432714268565, 'timestamp': '2025-10-01 04:13:50.561965', 'step': 2184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.592857', 'step': 2184, 'epoch': 3} {'type': 'loss', 'content': 0.01293744146823883, 'timestamp': '2025-10-01 04:13:50.595401', 'step': 2185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.625952', 'step': 2185, 'epoch': 3} {'type': 'loss', 'content': 0.0013602704275399446, 'timestamp': '2025-10-01 04:13:50.628065', 'step': 2186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.660109', 'step': 2186, 'epoch': 3} {'type': 'loss', 'content': 0.029963940382003784, 'timestamp': '2025-10-01 04:13:50.662355', 'step': 2187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.692996', 'step': 2187, 'epoch': 3} {'type': 'loss', 'content': 0.001546170562505722, 'timestamp': '2025-10-01 04:13:50.716697', 'step': 2188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.746866', 'step': 2188, 'epoch': 3} {'type': 'loss', 'content': 0.0025659650564193726, 'timestamp': '2025-10-01 04:13:50.749083', 'step': 2189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.779294', 'step': 2189, 'epoch': 3} {'type': 'loss', 'content': 0.0019798476714640856, 'timestamp': '2025-10-01 04:13:50.781579', 'step': 2190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:50.816634', 'step': 2190, 'epoch': 3} {'type': 'loss', 'content': 0.021714312955737114, 'timestamp': '2025-10-01 04:13:50.819334', 'step': 2191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.849601', 'step': 2191, 'epoch': 3} {'type': 'loss', 'content': 0.0006570308469235897, 'timestamp': '2025-10-01 04:13:50.873553', 'step': 2192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:50.903563', 'step': 2192, 'epoch': 3} {'type': 'loss', 'content': 0.0003911609819624573, 'timestamp': '2025-10-01 04:13:50.906388', 'step': 2193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:50.936361', 'step': 2193, 'epoch': 3} {'type': 'loss', 'content': 0.012683354318141937, 'timestamp': '2025-10-01 04:13:50.938846', 'step': 2194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:50.969034', 'step': 2194, 'epoch': 3} {'type': 'loss', 'content': 0.0007620817050337791, 'timestamp': '2025-10-01 04:13:50.971696', 'step': 2195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.002373', 'step': 2195, 'epoch': 3} {'type': 'loss', 'content': 0.04474819079041481, 'timestamp': '2025-10-01 04:13:51.025900', 'step': 2196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:51.056712', 'step': 2196, 'epoch': 3} {'type': 'loss', 'content': 0.02375408448278904, 'timestamp': '2025-10-01 04:13:51.058967', 'step': 2197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.089240', 'step': 2197, 'epoch': 3} {'type': 'loss', 'content': 0.003066360717639327, 'timestamp': '2025-10-01 04:13:51.091440', 'step': 2198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.121521', 'step': 2198, 'epoch': 3} {'type': 'loss', 'content': 0.0017424180405214429, 'timestamp': '2025-10-01 04:13:51.124399', 'step': 2199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:51.155109', 'step': 2199, 'epoch': 3} {'type': 'loss', 'content': 0.0005147810443304479, 'timestamp': '2025-10-01 04:13:51.178823', 'step': 2200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:51.209488', 'step': 2200, 'epoch': 3} {'type': 'loss', 'content': 0.0011206363560631871, 'timestamp': '2025-10-01 04:13:51.214386', 'step': 2201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.245040', 'step': 2201, 'epoch': 3} {'type': 'loss', 'content': 0.014846226200461388, 'timestamp': '2025-10-01 04:13:51.247451', 'step': 2202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.277363', 'step': 2202, 'epoch': 3} {'type': 'loss', 'content': 0.0005189668736420572, 'timestamp': '2025-10-01 04:13:51.280000', 'step': 2203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:51.310539', 'step': 2203, 'epoch': 3} {'type': 'loss', 'content': 0.0006393748917616904, 'timestamp': '2025-10-01 04:13:51.334449', 'step': 2204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.366533', 'step': 2204, 'epoch': 3} {'type': 'loss', 'content': 0.001081749564036727, 'timestamp': '2025-10-01 04:13:51.368875', 'step': 2205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.398790', 'step': 2205, 'epoch': 3} {'type': 'loss', 'content': 0.0028890289831906557, 'timestamp': '2025-10-01 04:13:51.401159', 'step': 2206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:51.432261', 'step': 2206, 'epoch': 3} {'type': 'loss', 'content': 0.0013795385602861643, 'timestamp': '2025-10-01 04:13:51.434848', 'step': 2207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.465835', 'step': 2207, 'epoch': 3} {'type': 'loss', 'content': 0.006973086390644312, 'timestamp': '2025-10-01 04:13:51.490177', 'step': 2208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.521034', 'step': 2208, 'epoch': 3} {'type': 'loss', 'content': 0.0016343960305675864, 'timestamp': '2025-10-01 04:13:51.523305', 'step': 2209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.554179', 'step': 2209, 'epoch': 3} {'type': 'loss', 'content': 0.0010523818200454116, 'timestamp': '2025-10-01 04:13:51.557872', 'step': 2210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.592688', 'step': 2210, 'epoch': 3} {'type': 'loss', 'content': 0.0007708879420533776, 'timestamp': '2025-10-01 04:13:51.594986', 'step': 2211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.624798', 'step': 2211, 'epoch': 3} {'type': 'loss', 'content': 0.001242365688085556, 'timestamp': '2025-10-01 04:13:51.648776', 'step': 2212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.683344', 'step': 2212, 'epoch': 3} {'type': 'loss', 'content': 0.009481081739068031, 'timestamp': '2025-10-01 04:13:51.686810', 'step': 2213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:51.718562', 'step': 2213, 'epoch': 3} {'type': 'loss', 'content': 0.0021511695813387632, 'timestamp': '2025-10-01 04:13:51.721237', 'step': 2214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.751752', 'step': 2214, 'epoch': 3} {'type': 'loss', 'content': 0.002733413130044937, 'timestamp': '2025-10-01 04:13:51.753880', 'step': 2215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:51.783955', 'step': 2215, 'epoch': 3} {'type': 'loss', 'content': 0.017588740214705467, 'timestamp': '2025-10-01 04:13:51.808100', 'step': 2216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.838417', 'step': 2216, 'epoch': 3} {'type': 'loss', 'content': 0.0065732793882489204, 'timestamp': '2025-10-01 04:13:51.841084', 'step': 2217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:51.872494', 'step': 2217, 'epoch': 3} {'type': 'loss', 'content': 0.016586460173130035, 'timestamp': '2025-10-01 04:13:51.875321', 'step': 2218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:51.912047', 'step': 2218, 'epoch': 3} {'type': 'loss', 'content': 0.0012413633521646261, 'timestamp': '2025-10-01 04:13:51.914928', 'step': 2219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:51.945079', 'step': 2219, 'epoch': 3} {'type': 'loss', 'content': 0.0022606845013797283, 'timestamp': '2025-10-01 04:13:51.970122', 'step': 2220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:52.000245', 'step': 2220, 'epoch': 3} {'type': 'loss', 'content': 0.000476359884487465, 'timestamp': '2025-10-01 04:13:52.002609', 'step': 2221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:52.032633', 'step': 2221, 'epoch': 3} {'type': 'loss', 'content': 0.0018863745499402285, 'timestamp': '2025-10-01 04:13:52.034902', 'step': 2222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:52.065407', 'step': 2222, 'epoch': 3} {'type': 'loss', 'content': 0.0005490041221491992, 'timestamp': '2025-10-01 04:13:52.068087', 'step': 2223, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:52.878486', 'step': 2223, 'epoch': 3} {'type': 'pplx', 'content': 57872429.750115015, 'timestamp': '2025-10-01 04:13:52.883516', 'step': 2223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:52.915823', 'step': 2223, 'epoch': 3} {'type': 'loss', 'content': 0.005303422920405865, 'timestamp': '2025-10-01 04:13:52.939665', 'step': 2224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:52.970698', 'step': 2224, 'epoch': 3} {'type': 'loss', 'content': 0.001239862642250955, 'timestamp': '2025-10-01 04:13:52.974647', 'step': 2225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.004930', 'step': 2225, 'epoch': 3} {'type': 'loss', 'content': 0.0014868489233776927, 'timestamp': '2025-10-01 04:13:53.007349', 'step': 2226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.039270', 'step': 2226, 'epoch': 3} {'type': 'loss', 'content': 0.014227977022528648, 'timestamp': '2025-10-01 04:13:53.041520', 'step': 2227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:53.072837', 'step': 2227, 'epoch': 3} {'type': 'loss', 'content': 0.01482341904193163, 'timestamp': '2025-10-01 04:13:53.096873', 'step': 2228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.127041', 'step': 2228, 'epoch': 3} {'type': 'loss', 'content': 0.00040583201916888356, 'timestamp': '2025-10-01 04:13:53.129207', 'step': 2229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.159992', 'step': 2229, 'epoch': 3} {'type': 'loss', 'content': 0.001420902437530458, 'timestamp': '2025-10-01 04:13:53.163552', 'step': 2230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.196656', 'step': 2230, 'epoch': 3} {'type': 'loss', 'content': 0.00013874395517632365, 'timestamp': '2025-10-01 04:13:53.198927', 'step': 2231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:53.230201', 'step': 2231, 'epoch': 3} {'type': 'loss', 'content': 0.018192876130342484, 'timestamp': '2025-10-01 04:13:53.254018', 'step': 2232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.284629', 'step': 2232, 'epoch': 3} {'type': 'loss', 'content': 0.0024055861867964268, 'timestamp': '2025-10-01 04:13:53.286870', 'step': 2233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.319096', 'step': 2233, 'epoch': 3} {'type': 'loss', 'content': 0.0012060150038450956, 'timestamp': '2025-10-01 04:13:53.321182', 'step': 2234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.351973', 'step': 2234, 'epoch': 3} {'type': 'loss', 'content': 0.001272205961868167, 'timestamp': '2025-10-01 04:13:53.354100', 'step': 2235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:53.385556', 'step': 2235, 'epoch': 3} {'type': 'loss', 'content': 0.0032269915100187063, 'timestamp': '2025-10-01 04:13:53.412538', 'step': 2236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:53.443382', 'step': 2236, 'epoch': 3} {'type': 'loss', 'content': 0.001299424795433879, 'timestamp': '2025-10-01 04:13:53.445926', 'step': 2237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.477086', 'step': 2237, 'epoch': 3} {'type': 'loss', 'content': 0.0036020518746227026, 'timestamp': '2025-10-01 04:13:53.479343', 'step': 2238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:53.511060', 'step': 2238, 'epoch': 3} {'type': 'loss', 'content': 0.027935948222875595, 'timestamp': '2025-10-01 04:13:53.513294', 'step': 2239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:53.544020', 'step': 2239, 'epoch': 3} {'type': 'loss', 'content': 0.003648341167718172, 'timestamp': '2025-10-01 04:13:53.567927', 'step': 2240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.599405', 'step': 2240, 'epoch': 3} {'type': 'loss', 'content': 0.0017400241922587156, 'timestamp': '2025-10-01 04:13:53.601898', 'step': 2241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:53.633111', 'step': 2241, 'epoch': 3} {'type': 'loss', 'content': 0.00026442264788784087, 'timestamp': '2025-10-01 04:13:53.635470', 'step': 2242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.666920', 'step': 2242, 'epoch': 3} {'type': 'loss', 'content': 0.0010703576263040304, 'timestamp': '2025-10-01 04:13:53.669857', 'step': 2243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.700975', 'step': 2243, 'epoch': 3} {'type': 'loss', 'content': 0.0018084843177348375, 'timestamp': '2025-10-01 04:13:53.724547', 'step': 2244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:53.754331', 'step': 2244, 'epoch': 3} {'type': 'loss', 'content': 0.004137086682021618, 'timestamp': '2025-10-01 04:13:53.756585', 'step': 2245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:53.787015', 'step': 2245, 'epoch': 3} {'type': 'loss', 'content': 0.004625528585165739, 'timestamp': '2025-10-01 04:13:53.789622', 'step': 2246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.819494', 'step': 2246, 'epoch': 3} {'type': 'loss', 'content': 0.002892305376008153, 'timestamp': '2025-10-01 04:13:53.821935', 'step': 2247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.852186', 'step': 2247, 'epoch': 3} {'type': 'loss', 'content': 0.010060891509056091, 'timestamp': '2025-10-01 04:13:53.876285', 'step': 2248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.908538', 'step': 2248, 'epoch': 3} {'type': 'loss', 'content': 0.010718696750700474, 'timestamp': '2025-10-01 04:13:53.911889', 'step': 2249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.943796', 'step': 2249, 'epoch': 3} {'type': 'loss', 'content': 0.0001337795110885054, 'timestamp': '2025-10-01 04:13:53.946157', 'step': 2250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:53.976787', 'step': 2250, 'epoch': 3} {'type': 'loss', 'content': 0.004378857556730509, 'timestamp': '2025-10-01 04:13:53.979403', 'step': 2251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.009828', 'step': 2251, 'epoch': 3} {'type': 'loss', 'content': 0.005483386572450399, 'timestamp': '2025-10-01 04:13:54.033818', 'step': 2252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.065109', 'step': 2252, 'epoch': 3} {'type': 'loss', 'content': 0.0017293499549850821, 'timestamp': '2025-10-01 04:13:54.067304', 'step': 2253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:54.100189', 'step': 2253, 'epoch': 3} {'type': 'loss', 'content': 0.0008020902168937027, 'timestamp': '2025-10-01 04:13:54.102844', 'step': 2254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.133014', 'step': 2254, 'epoch': 3} {'type': 'loss', 'content': 0.006860501132905483, 'timestamp': '2025-10-01 04:13:54.135596', 'step': 2255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:54.165944', 'step': 2255, 'epoch': 3} {'type': 'loss', 'content': 0.00011491885379655287, 'timestamp': '2025-10-01 04:13:54.189484', 'step': 2256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:54.220908', 'step': 2256, 'epoch': 3} {'type': 'loss', 'content': 0.0001829215616453439, 'timestamp': '2025-10-01 04:13:54.223228', 'step': 2257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:54.255147', 'step': 2257, 'epoch': 3} {'type': 'loss', 'content': 0.006004713010042906, 'timestamp': '2025-10-01 04:13:54.257397', 'step': 2258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:54.288126', 'step': 2258, 'epoch': 3} {'type': 'loss', 'content': 0.0025731585919857025, 'timestamp': '2025-10-01 04:13:54.290963', 'step': 2259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:54.321128', 'step': 2259, 'epoch': 3} {'type': 'loss', 'content': 0.005405474919825792, 'timestamp': '2025-10-01 04:13:54.344849', 'step': 2260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.376244', 'step': 2260, 'epoch': 3} {'type': 'loss', 'content': 0.001444359077140689, 'timestamp': '2025-10-01 04:13:54.378603', 'step': 2261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.408929', 'step': 2261, 'epoch': 3} {'type': 'loss', 'content': 0.0011323849903419614, 'timestamp': '2025-10-01 04:13:54.411072', 'step': 2262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.440934', 'step': 2262, 'epoch': 3} {'type': 'loss', 'content': 0.004098639357835054, 'timestamp': '2025-10-01 04:13:54.443178', 'step': 2263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:54.474479', 'step': 2263, 'epoch': 3} {'type': 'loss', 'content': 0.0014724934007972479, 'timestamp': '2025-10-01 04:13:54.498620', 'step': 2264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.528900', 'step': 2264, 'epoch': 3} {'type': 'loss', 'content': 0.00010470008419360965, 'timestamp': '2025-10-01 04:13:54.531177', 'step': 2265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:54.561929', 'step': 2265, 'epoch': 3} {'type': 'loss', 'content': 0.0005662904004566371, 'timestamp': '2025-10-01 04:13:54.564673', 'step': 2266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.595382', 'step': 2266, 'epoch': 3} {'type': 'loss', 'content': 0.02000434696674347, 'timestamp': '2025-10-01 04:13:54.597915', 'step': 2267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:54.628014', 'step': 2267, 'epoch': 3} {'type': 'loss', 'content': 0.003685658099129796, 'timestamp': '2025-10-01 04:13:54.651964', 'step': 2268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.681973', 'step': 2268, 'epoch': 3} {'type': 'loss', 'content': 0.00028314744122326374, 'timestamp': '2025-10-01 04:13:54.684005', 'step': 2269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.715355', 'step': 2269, 'epoch': 3} {'type': 'loss', 'content': 0.0002110866189468652, 'timestamp': '2025-10-01 04:13:54.717723', 'step': 2270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.748149', 'step': 2270, 'epoch': 3} {'type': 'loss', 'content': 0.000801309070084244, 'timestamp': '2025-10-01 04:13:54.750891', 'step': 2271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.781274', 'step': 2271, 'epoch': 3} {'type': 'loss', 'content': 0.0008623714675195515, 'timestamp': '2025-10-01 04:13:54.805287', 'step': 2272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.837178', 'step': 2272, 'epoch': 3} {'type': 'loss', 'content': 0.00174813368357718, 'timestamp': '2025-10-01 04:13:54.839371', 'step': 2273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.869678', 'step': 2273, 'epoch': 3} {'type': 'loss', 'content': 0.001442483626306057, 'timestamp': '2025-10-01 04:13:54.872114', 'step': 2274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.902878', 'step': 2274, 'epoch': 3} {'type': 'loss', 'content': 0.0007527487468905747, 'timestamp': '2025-10-01 04:13:54.905230', 'step': 2275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:54.936627', 'step': 2275, 'epoch': 3} {'type': 'loss', 'content': 0.0018074663821607828, 'timestamp': '2025-10-01 04:13:54.960309', 'step': 2276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:54.993177', 'step': 2276, 'epoch': 3} {'type': 'loss', 'content': 0.00037215909105725586, 'timestamp': '2025-10-01 04:13:54.995943', 'step': 2277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:55.027011', 'step': 2277, 'epoch': 3} {'type': 'loss', 'content': 0.01679880917072296, 'timestamp': '2025-10-01 04:13:55.029564', 'step': 2278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:55.060916', 'step': 2278, 'epoch': 3} {'type': 'loss', 'content': 0.0008941438281908631, 'timestamp': '2025-10-01 04:13:55.063248', 'step': 2279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:55.094396', 'step': 2279, 'epoch': 3} {'type': 'loss', 'content': 0.0006860237335786223, 'timestamp': '2025-10-01 04:13:55.118517', 'step': 2280, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:55.947999', 'step': 2280, 'epoch': 3} {'type': 'pplx', 'content': 66637274.69816364, 'timestamp': '2025-10-01 04:13:55.950876', 'step': 2280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:55.981527', 'step': 2280, 'epoch': 3} {'type': 'loss', 'content': 0.0006886274204589427, 'timestamp': '2025-10-01 04:13:55.984672', 'step': 2281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.015998', 'step': 2281, 'epoch': 3} {'type': 'loss', 'content': 0.004052779637277126, 'timestamp': '2025-10-01 04:13:56.018978', 'step': 2282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:56.051788', 'step': 2282, 'epoch': 3} {'type': 'loss', 'content': 0.00020752070122398436, 'timestamp': '2025-10-01 04:13:56.054491', 'step': 2283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.088631', 'step': 2283, 'epoch': 3} {'type': 'loss', 'content': 0.0004296154365874827, 'timestamp': '2025-10-01 04:13:56.114021', 'step': 2284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.148318', 'step': 2284, 'epoch': 3} {'type': 'loss', 'content': 0.0006741755059920251, 'timestamp': '2025-10-01 04:13:56.151630', 'step': 2285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.184792', 'step': 2285, 'epoch': 3} {'type': 'loss', 'content': 0.0002825469709932804, 'timestamp': '2025-10-01 04:13:56.188132', 'step': 2286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.221273', 'step': 2286, 'epoch': 3} {'type': 'loss', 'content': 0.00020196287368889898, 'timestamp': '2025-10-01 04:13:56.224908', 'step': 2287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:13:56.258003', 'step': 2287, 'epoch': 3} {'type': 'loss', 'content': 0.0006172371795400977, 'timestamp': '2025-10-01 04:13:56.284258', 'step': 2288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:56.326859', 'step': 2288, 'epoch': 3} {'type': 'loss', 'content': 0.00020179711282253265, 'timestamp': '2025-10-01 04:13:56.330074', 'step': 2289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.363187', 'step': 2289, 'epoch': 3} {'type': 'loss', 'content': 0.0001798906596377492, 'timestamp': '2025-10-01 04:13:56.366055', 'step': 2290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.398175', 'step': 2290, 'epoch': 3} {'type': 'loss', 'content': 0.0007496423204429448, 'timestamp': '2025-10-01 04:13:56.401033', 'step': 2291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:56.433795', 'step': 2291, 'epoch': 3} {'type': 'loss', 'content': 0.0031232612673193216, 'timestamp': '2025-10-01 04:13:56.458085', 'step': 2292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.490985', 'step': 2292, 'epoch': 3} {'type': 'loss', 'content': 0.00034845713526010513, 'timestamp': '2025-10-01 04:13:56.493978', 'step': 2293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:56.527730', 'step': 2293, 'epoch': 3} {'type': 'loss', 'content': 0.002042332198470831, 'timestamp': '2025-10-01 04:13:56.530788', 'step': 2294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.564274', 'step': 2294, 'epoch': 3} {'type': 'loss', 'content': 0.0036737422924488783, 'timestamp': '2025-10-01 04:13:56.567665', 'step': 2295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.599395', 'step': 2295, 'epoch': 3} {'type': 'loss', 'content': 0.00028754607774317265, 'timestamp': '2025-10-01 04:13:56.623419', 'step': 2296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.656536', 'step': 2296, 'epoch': 3} {'type': 'loss', 'content': 0.0005289117689244449, 'timestamp': '2025-10-01 04:13:56.659456', 'step': 2297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.691957', 'step': 2297, 'epoch': 3} {'type': 'loss', 'content': 0.001023622928187251, 'timestamp': '2025-10-01 04:13:56.695324', 'step': 2298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.728713', 'step': 2298, 'epoch': 3} {'type': 'loss', 'content': 0.00013159470108803362, 'timestamp': '2025-10-01 04:13:56.731160', 'step': 2299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.763634', 'step': 2299, 'epoch': 3} {'type': 'loss', 'content': 0.0003687890130095184, 'timestamp': '2025-10-01 04:13:56.788127', 'step': 2300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.820633', 'step': 2300, 'epoch': 3} {'type': 'loss', 'content': 0.00016187668370548636, 'timestamp': '2025-10-01 04:13:56.823444', 'step': 2301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.857460', 'step': 2301, 'epoch': 3} {'type': 'loss', 'content': 0.00014664800255559385, 'timestamp': '2025-10-01 04:13:56.859740', 'step': 2302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.890101', 'step': 2302, 'epoch': 3} {'type': 'loss', 'content': 0.0200524739921093, 'timestamp': '2025-10-01 04:13:56.892366', 'step': 2303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.922653', 'step': 2303, 'epoch': 3} {'type': 'loss', 'content': 0.0009536169818602502, 'timestamp': '2025-10-01 04:13:56.946284', 'step': 2304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:56.977135', 'step': 2304, 'epoch': 3} {'type': 'loss', 'content': 0.00014004905824549496, 'timestamp': '2025-10-01 04:13:56.979796', 'step': 2305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.011129', 'step': 2305, 'epoch': 3} {'type': 'loss', 'content': 0.0004129658918827772, 'timestamp': '2025-10-01 04:13:57.014211', 'step': 2306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:57.044673', 'step': 2306, 'epoch': 3} {'type': 'loss', 'content': 0.0029463896062225103, 'timestamp': '2025-10-01 04:13:57.047446', 'step': 2307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:57.078767', 'step': 2307, 'epoch': 3} {'type': 'loss', 'content': 0.0015465685864910483, 'timestamp': '2025-10-01 04:13:57.103049', 'step': 2308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:57.134295', 'step': 2308, 'epoch': 3} {'type': 'loss', 'content': 0.00033053886727429926, 'timestamp': '2025-10-01 04:13:57.136656', 'step': 2309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.167490', 'step': 2309, 'epoch': 3} {'type': 'loss', 'content': 0.0004535024636425078, 'timestamp': '2025-10-01 04:13:57.170113', 'step': 2310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.200399', 'step': 2310, 'epoch': 3} {'type': 'loss', 'content': 0.00019416247960180044, 'timestamp': '2025-10-01 04:13:57.202965', 'step': 2311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.233831', 'step': 2311, 'epoch': 3} {'type': 'loss', 'content': 0.000307450391119346, 'timestamp': '2025-10-01 04:13:57.257717', 'step': 2312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.289110', 'step': 2312, 'epoch': 3} {'type': 'loss', 'content': 0.0004815698484890163, 'timestamp': '2025-10-01 04:13:57.291523', 'step': 2313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.322062', 'step': 2313, 'epoch': 3} {'type': 'loss', 'content': 9.445106115890667e-05, 'timestamp': '2025-10-01 04:13:57.324496', 'step': 2314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.355461', 'step': 2314, 'epoch': 3} {'type': 'loss', 'content': 0.00887442845851183, 'timestamp': '2025-10-01 04:13:57.358252', 'step': 2315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.389065', 'step': 2315, 'epoch': 3} {'type': 'loss', 'content': 0.0002211143437307328, 'timestamp': '2025-10-01 04:13:57.412957', 'step': 2316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:57.445065', 'step': 2316, 'epoch': 3} {'type': 'loss', 'content': 0.00011029041343135759, 'timestamp': '2025-10-01 04:13:57.447586', 'step': 2317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.480235', 'step': 2317, 'epoch': 3} {'type': 'loss', 'content': 0.00045324538950808346, 'timestamp': '2025-10-01 04:13:57.482531', 'step': 2318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:57.513974', 'step': 2318, 'epoch': 3} {'type': 'loss', 'content': 0.0002062932326225564, 'timestamp': '2025-10-01 04:13:57.516597', 'step': 2319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.548161', 'step': 2319, 'epoch': 3} {'type': 'loss', 'content': 0.0019155825721099973, 'timestamp': '2025-10-01 04:13:57.572038', 'step': 2320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:57.603052', 'step': 2320, 'epoch': 3} {'type': 'loss', 'content': 0.0013960811775177717, 'timestamp': '2025-10-01 04:13:57.605522', 'step': 2321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.635696', 'step': 2321, 'epoch': 3} {'type': 'loss', 'content': 0.00019416447321418673, 'timestamp': '2025-10-01 04:13:57.638213', 'step': 2322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.668572', 'step': 2322, 'epoch': 3} {'type': 'loss', 'content': 0.005333646200597286, 'timestamp': '2025-10-01 04:13:57.670929', 'step': 2323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:57.701421', 'step': 2323, 'epoch': 3} {'type': 'loss', 'content': 0.00022433795675169677, 'timestamp': '2025-10-01 04:13:57.725539', 'step': 2324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.756019', 'step': 2324, 'epoch': 3} {'type': 'loss', 'content': 0.0015846246387809515, 'timestamp': '2025-10-01 04:13:57.758910', 'step': 2325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:57.790254', 'step': 2325, 'epoch': 3} {'type': 'loss', 'content': 0.001096657244488597, 'timestamp': '2025-10-01 04:13:57.792998', 'step': 2326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.824209', 'step': 2326, 'epoch': 3} {'type': 'loss', 'content': 0.007495674304664135, 'timestamp': '2025-10-01 04:13:57.826585', 'step': 2327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.857064', 'step': 2327, 'epoch': 3} {'type': 'loss', 'content': 0.007593679241836071, 'timestamp': '2025-10-01 04:13:57.880922', 'step': 2328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.911681', 'step': 2328, 'epoch': 3} {'type': 'loss', 'content': 0.000178185073309578, 'timestamp': '2025-10-01 04:13:57.913868', 'step': 2329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:57.943925', 'step': 2329, 'epoch': 3} {'type': 'loss', 'content': 0.00029983054264448583, 'timestamp': '2025-10-01 04:13:57.946258', 'step': 2330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:57.976362', 'step': 2330, 'epoch': 3} {'type': 'loss', 'content': 0.0005218511214479804, 'timestamp': '2025-10-01 04:13:57.979122', 'step': 2331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:58.014030', 'step': 2331, 'epoch': 3} {'type': 'loss', 'content': 0.0006095552816987038, 'timestamp': '2025-10-01 04:13:58.038114', 'step': 2332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:58.069940', 'step': 2332, 'epoch': 3} {'type': 'loss', 'content': 0.0006886310293339193, 'timestamp': '2025-10-01 04:13:58.072385', 'step': 2333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:58.107624', 'step': 2333, 'epoch': 3} {'type': 'loss', 'content': 0.0004724813625216484, 'timestamp': '2025-10-01 04:13:58.110276', 'step': 2334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:58.142440', 'step': 2334, 'epoch': 3} {'type': 'loss', 'content': 0.0001418648025719449, 'timestamp': '2025-10-01 04:13:58.145038', 'step': 2335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:58.176299', 'step': 2335, 'epoch': 3} {'type': 'loss', 'content': 0.007086599711328745, 'timestamp': '2025-10-01 04:13:58.200243', 'step': 2336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:58.231939', 'step': 2336, 'epoch': 3} {'type': 'loss', 'content': 0.0019127464620396495, 'timestamp': '2025-10-01 04:13:58.234561', 'step': 2337, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:13:58.987132', 'step': 2337, 'epoch': 3} {'type': 'pplx', 'content': 69247012.9575338, 'timestamp': '2025-10-01 04:13:58.990008', 'step': 2337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.019342', 'step': 2337, 'epoch': 3} {'type': 'loss', 'content': 0.0002083075523842126, 'timestamp': '2025-10-01 04:13:59.021841', 'step': 2338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.052298', 'step': 2338, 'epoch': 3} {'type': 'loss', 'content': 8.348600385943428e-05, 'timestamp': '2025-10-01 04:13:59.054761', 'step': 2339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.085523', 'step': 2339, 'epoch': 3} {'type': 'loss', 'content': 0.01127583347260952, 'timestamp': '2025-10-01 04:13:59.111836', 'step': 2340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:59.142895', 'step': 2340, 'epoch': 3} {'type': 'loss', 'content': 0.008107397705316544, 'timestamp': '2025-10-01 04:13:59.145588', 'step': 2341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:59.176979', 'step': 2341, 'epoch': 3} {'type': 'loss', 'content': 0.02429473027586937, 'timestamp': '2025-10-01 04:13:59.179351', 'step': 2342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.211977', 'step': 2342, 'epoch': 3} {'type': 'loss', 'content': 0.00030902380240149796, 'timestamp': '2025-10-01 04:13:59.214422', 'step': 2343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.244579', 'step': 2343, 'epoch': 3} {'type': 'loss', 'content': 0.0004113315953873098, 'timestamp': '2025-10-01 04:13:59.268573', 'step': 2344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.300573', 'step': 2344, 'epoch': 3} {'type': 'loss', 'content': 0.046647004783153534, 'timestamp': '2025-10-01 04:13:59.303872', 'step': 2345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:59.336321', 'step': 2345, 'epoch': 3} {'type': 'loss', 'content': 0.0014374825404956937, 'timestamp': '2025-10-01 04:13:59.339699', 'step': 2346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.371559', 'step': 2346, 'epoch': 3} {'type': 'loss', 'content': 6.242156814550981e-05, 'timestamp': '2025-10-01 04:13:59.375361', 'step': 2347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.406163', 'step': 2347, 'epoch': 3} {'type': 'loss', 'content': 0.05559505149722099, 'timestamp': '2025-10-01 04:13:59.431596', 'step': 2348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.463205', 'step': 2348, 'epoch': 3} {'type': 'loss', 'content': 0.00017471036699134856, 'timestamp': '2025-10-01 04:13:59.466400', 'step': 2349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:59.500536', 'step': 2349, 'epoch': 3} {'type': 'loss', 'content': 0.06607859581708908, 'timestamp': '2025-10-01 04:13:59.503331', 'step': 2350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:13:59.534609', 'step': 2350, 'epoch': 3} {'type': 'loss', 'content': 0.0028158756904304028, 'timestamp': '2025-10-01 04:13:59.537181', 'step': 2351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.568813', 'step': 2351, 'epoch': 3} {'type': 'loss', 'content': 0.0007161787361837924, 'timestamp': '2025-10-01 04:13:59.592904', 'step': 2352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.625572', 'step': 2352, 'epoch': 3} {'type': 'loss', 'content': 6.582152127521113e-05, 'timestamp': '2025-10-01 04:13:59.627797', 'step': 2353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:59.660698', 'step': 2353, 'epoch': 3} {'type': 'loss', 'content': 0.016403676941990852, 'timestamp': '2025-10-01 04:13:59.663445', 'step': 2354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:59.694490', 'step': 2354, 'epoch': 3} {'type': 'loss', 'content': 0.00014711846597492695, 'timestamp': '2025-10-01 04:13:59.697214', 'step': 2355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.727858', 'step': 2355, 'epoch': 3} {'type': 'loss', 'content': 8.415959746344015e-05, 'timestamp': '2025-10-01 04:13:59.752319', 'step': 2356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.786727', 'step': 2356, 'epoch': 3} {'type': 'loss', 'content': 0.00010823925549630076, 'timestamp': '2025-10-01 04:13:59.789310', 'step': 2357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:59.824218', 'step': 2357, 'epoch': 3} {'type': 'loss', 'content': 8.591300866100937e-05, 'timestamp': '2025-10-01 04:13:59.828789', 'step': 2358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.861217', 'step': 2358, 'epoch': 3} {'type': 'loss', 'content': 0.00018384023860562593, 'timestamp': '2025-10-01 04:13:59.863539', 'step': 2359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.895521', 'step': 2359, 'epoch': 3} {'type': 'loss', 'content': 0.024127814918756485, 'timestamp': '2025-10-01 04:13:59.919735', 'step': 2360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:13:59.955433', 'step': 2360, 'epoch': 3} {'type': 'loss', 'content': 8.21871726657264e-05, 'timestamp': '2025-10-01 04:13:59.958042', 'step': 2361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:13:59.989529', 'step': 2361, 'epoch': 3} {'type': 'loss', 'content': 0.0018586823716759682, 'timestamp': '2025-10-01 04:13:59.993816', 'step': 2362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.024887', 'step': 2362, 'epoch': 3} {'type': 'loss', 'content': 0.00025929472758434713, 'timestamp': '2025-10-01 04:14:00.027665', 'step': 2363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.058808', 'step': 2363, 'epoch': 3} {'type': 'loss', 'content': 0.002079632831737399, 'timestamp': '2025-10-01 04:14:00.083423', 'step': 2364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.115034', 'step': 2364, 'epoch': 3} {'type': 'loss', 'content': 0.02933317981660366, 'timestamp': '2025-10-01 04:14:00.117653', 'step': 2365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.148211', 'step': 2365, 'epoch': 3} {'type': 'loss', 'content': 0.013687198050320148, 'timestamp': '2025-10-01 04:14:00.150738', 'step': 2366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.182206', 'step': 2366, 'epoch': 3} {'type': 'loss', 'content': 0.0029233102686703205, 'timestamp': '2025-10-01 04:14:00.185068', 'step': 2367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.215640', 'step': 2367, 'epoch': 3} {'type': 'loss', 'content': 0.010431193746626377, 'timestamp': '2025-10-01 04:14:00.239646', 'step': 2368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.270415', 'step': 2368, 'epoch': 3} {'type': 'loss', 'content': 0.00024859068798832595, 'timestamp': '2025-10-01 04:14:00.273133', 'step': 2369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.304287', 'step': 2369, 'epoch': 3} {'type': 'loss', 'content': 0.0006030590157024562, 'timestamp': '2025-10-01 04:14:00.306784', 'step': 2370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.337195', 'step': 2370, 'epoch': 3} {'type': 'loss', 'content': 0.043975941836833954, 'timestamp': '2025-10-01 04:14:00.340000', 'step': 2371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.370964', 'step': 2371, 'epoch': 3} {'type': 'loss', 'content': 0.0028401927556842566, 'timestamp': '2025-10-01 04:14:00.394905', 'step': 2372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.426091', 'step': 2372, 'epoch': 3} {'type': 'loss', 'content': 0.07099797576665878, 'timestamp': '2025-10-01 04:14:00.428473', 'step': 2373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.458817', 'step': 2373, 'epoch': 3} {'type': 'loss', 'content': 0.0027063204906880856, 'timestamp': '2025-10-01 04:14:00.462017', 'step': 2374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.492093', 'step': 2374, 'epoch': 3} {'type': 'loss', 'content': 0.09009862691164017, 'timestamp': '2025-10-01 04:14:00.495027', 'step': 2375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.525451', 'step': 2375, 'epoch': 3} {'type': 'loss', 'content': 0.018117502331733704, 'timestamp': '2025-10-01 04:14:00.549383', 'step': 2376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.580272', 'step': 2376, 'epoch': 3} {'type': 'loss', 'content': 0.0008630921947769821, 'timestamp': '2025-10-01 04:14:00.582748', 'step': 2377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.613816', 'step': 2377, 'epoch': 3} {'type': 'loss', 'content': 0.04184184595942497, 'timestamp': '2025-10-01 04:14:00.616387', 'step': 2378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.647532', 'step': 2378, 'epoch': 3} {'type': 'loss', 'content': 0.0033638239838182926, 'timestamp': '2025-10-01 04:14:00.651025', 'step': 2379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.682110', 'step': 2379, 'epoch': 3} {'type': 'loss', 'content': 0.003705172333866358, 'timestamp': '2025-10-01 04:14:00.706053', 'step': 2380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.737177', 'step': 2380, 'epoch': 3} {'type': 'loss', 'content': 0.013101239688694477, 'timestamp': '2025-10-01 04:14:00.739782', 'step': 2381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:00.771275', 'step': 2381, 'epoch': 3} {'type': 'loss', 'content': 0.003192309755831957, 'timestamp': '2025-10-01 04:14:00.773775', 'step': 2382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.805257', 'step': 2382, 'epoch': 3} {'type': 'loss', 'content': 0.024804024025797844, 'timestamp': '2025-10-01 04:14:00.807975', 'step': 2383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.839636', 'step': 2383, 'epoch': 3} {'type': 'loss', 'content': 0.016119828447699547, 'timestamp': '2025-10-01 04:14:00.863745', 'step': 2384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.895939', 'step': 2384, 'epoch': 3} {'type': 'loss', 'content': 0.005999320652335882, 'timestamp': '2025-10-01 04:14:00.898325', 'step': 2385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.929460', 'step': 2385, 'epoch': 3} {'type': 'loss', 'content': 0.011366310529410839, 'timestamp': '2025-10-01 04:14:00.931995', 'step': 2386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:00.964068', 'step': 2386, 'epoch': 3} {'type': 'loss', 'content': 0.021910708397626877, 'timestamp': '2025-10-01 04:14:00.966187', 'step': 2387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:00.997241', 'step': 2387, 'epoch': 3} {'type': 'loss', 'content': 0.01992500014603138, 'timestamp': '2025-10-01 04:14:01.021334', 'step': 2388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:01.053577', 'step': 2388, 'epoch': 3} {'type': 'loss', 'content': 0.009192356839776039, 'timestamp': '2025-10-01 04:14:01.056098', 'step': 2389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:01.087673', 'step': 2389, 'epoch': 3} {'type': 'loss', 'content': 0.0116264121606946, 'timestamp': '2025-10-01 04:14:01.090460', 'step': 2390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:01.122261', 'step': 2390, 'epoch': 3} {'type': 'loss', 'content': 0.011775086633861065, 'timestamp': '2025-10-01 04:14:01.124701', 'step': 2391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:01.156100', 'step': 2391, 'epoch': 3} {'type': 'loss', 'content': 0.015654785558581352, 'timestamp': '2025-10-01 04:14:01.180111', 'step': 2392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:01.211124', 'step': 2392, 'epoch': 3} {'type': 'loss', 'content': 0.014394355937838554, 'timestamp': '2025-10-01 04:14:01.215194', 'step': 2393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:01.245990', 'step': 2393, 'epoch': 3} {'type': 'loss', 'content': 0.007884612306952477, 'timestamp': '2025-10-01 04:14:01.248763', 'step': 2394, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:02.017783', 'step': 2394, 'epoch': 3} {'type': 'pplx', 'content': 52674423.42730978, 'timestamp': '2025-10-01 04:14:02.019827', 'step': 2394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.050004', 'step': 2394, 'epoch': 3} {'type': 'loss', 'content': 0.015185212716460228, 'timestamp': '2025-10-01 04:14:02.052314', 'step': 2395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.085021', 'step': 2395, 'epoch': 3} {'type': 'loss', 'content': 0.008075162768363953, 'timestamp': '2025-10-01 04:14:02.109345', 'step': 2396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.143533', 'step': 2396, 'epoch': 3} {'type': 'loss', 'content': 0.012511581182479858, 'timestamp': '2025-10-01 04:14:02.146093', 'step': 2397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.179979', 'step': 2397, 'epoch': 3} {'type': 'loss', 'content': 0.011734097264707088, 'timestamp': '2025-10-01 04:14:02.182677', 'step': 2398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:02.216565', 'step': 2398, 'epoch': 3} {'type': 'loss', 'content': 0.01114694681018591, 'timestamp': '2025-10-01 04:14:02.219096', 'step': 2399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.251696', 'step': 2399, 'epoch': 3} {'type': 'loss', 'content': 0.009856194257736206, 'timestamp': '2025-10-01 04:14:02.276049', 'step': 2400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.311475', 'step': 2400, 'epoch': 3} {'type': 'loss', 'content': 0.007763002533465624, 'timestamp': '2025-10-01 04:14:02.314508', 'step': 2401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:14:02.347485', 'step': 2401, 'epoch': 3} {'type': 'loss', 'content': 0.0154880927875638, 'timestamp': '2025-10-01 04:14:02.350438', 'step': 2402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.384076', 'step': 2402, 'epoch': 3} {'type': 'loss', 'content': 0.014285017736256123, 'timestamp': '2025-10-01 04:14:02.386525', 'step': 2403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.419442', 'step': 2403, 'epoch': 3} {'type': 'loss', 'content': 0.008886818774044514, 'timestamp': '2025-10-01 04:14:02.443889', 'step': 2404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.477835', 'step': 2404, 'epoch': 3} {'type': 'loss', 'content': 0.00973739568144083, 'timestamp': '2025-10-01 04:14:02.480309', 'step': 2405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.527693', 'step': 2405, 'epoch': 3} {'type': 'loss', 'content': 0.01172753144055605, 'timestamp': '2025-10-01 04:14:02.534089', 'step': 2406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:02.579922', 'step': 2406, 'epoch': 3} {'type': 'loss', 'content': 0.009691670536994934, 'timestamp': '2025-10-01 04:14:02.583021', 'step': 2407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.615011', 'step': 2407, 'epoch': 3} {'type': 'loss', 'content': 0.008152500726282597, 'timestamp': '2025-10-01 04:14:02.640631', 'step': 2408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.679784', 'step': 2408, 'epoch': 3} {'type': 'loss', 'content': 0.011871717870235443, 'timestamp': '2025-10-01 04:14:02.681970', 'step': 2409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.715916', 'step': 2409, 'epoch': 3} {'type': 'loss', 'content': 0.018376769497990608, 'timestamp': '2025-10-01 04:14:02.718064', 'step': 2410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:02.750256', 'step': 2410, 'epoch': 3} {'type': 'loss', 'content': 0.010106426663696766, 'timestamp': '2025-10-01 04:14:02.754437', 'step': 2411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:14:02.785988', 'step': 2411, 'epoch': 3} {'type': 'loss', 'content': 0.01129311416298151, 'timestamp': '2025-10-01 04:14:02.810005', 'step': 2412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.850051', 'step': 2412, 'epoch': 3} {'type': 'loss', 'content': 0.008512974716722965, 'timestamp': '2025-10-01 04:14:02.854452', 'step': 2413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.886225', 'step': 2413, 'epoch': 3} {'type': 'loss', 'content': 0.006000932771712542, 'timestamp': '2025-10-01 04:14:02.889066', 'step': 2414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:02.920294', 'step': 2414, 'epoch': 3} {'type': 'loss', 'content': 0.007568769156932831, 'timestamp': '2025-10-01 04:14:02.923494', 'step': 2415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:02.954710', 'step': 2415, 'epoch': 3} {'type': 'loss', 'content': 0.004484867211431265, 'timestamp': '2025-10-01 04:14:02.978807', 'step': 2416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:03.011179', 'step': 2416, 'epoch': 3} {'type': 'loss', 'content': 0.006643087603151798, 'timestamp': '2025-10-01 04:14:03.013504', 'step': 2417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:03.047362', 'step': 2417, 'epoch': 3} {'type': 'loss', 'content': 0.007000813726335764, 'timestamp': '2025-10-01 04:14:03.049777', 'step': 2418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.081423', 'step': 2418, 'epoch': 3} {'type': 'loss', 'content': 0.012717373669147491, 'timestamp': '2025-10-01 04:14:03.084148', 'step': 2419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.115610', 'step': 2419, 'epoch': 3} {'type': 'loss', 'content': 0.005935595370829105, 'timestamp': '2025-10-01 04:14:03.142767', 'step': 2420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.176168', 'step': 2420, 'epoch': 3} {'type': 'loss', 'content': 0.004426135681569576, 'timestamp': '2025-10-01 04:14:03.178773', 'step': 2421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:03.210338', 'step': 2421, 'epoch': 3} {'type': 'loss', 'content': 0.024797197431325912, 'timestamp': '2025-10-01 04:14:03.212759', 'step': 2422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.244167', 'step': 2422, 'epoch': 3} {'type': 'loss', 'content': 0.005650327540934086, 'timestamp': '2025-10-01 04:14:03.246269', 'step': 2423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.278142', 'step': 2423, 'epoch': 3} {'type': 'loss', 'content': 0.006938618142157793, 'timestamp': '2025-10-01 04:14:03.302270', 'step': 2424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-10-01 04:14:03.333865', 'step': 2424, 'epoch': 3} {'type': 'loss', 'content': 0.0073814005590975285, 'timestamp': '2025-10-01 04:14:03.341872', 'step': 2425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.374439', 'step': 2425, 'epoch': 3} {'type': 'loss', 'content': 0.004846468102186918, 'timestamp': '2025-10-01 04:14:03.379119', 'step': 2426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.410258', 'step': 2426, 'epoch': 3} {'type': 'loss', 'content': 0.018962262198328972, 'timestamp': '2025-10-01 04:14:03.412977', 'step': 2427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:03.457844', 'step': 2427, 'epoch': 3} {'type': 'loss', 'content': 0.0020291144028306007, 'timestamp': '2025-10-01 04:14:03.481960', 'step': 2428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.514020', 'step': 2428, 'epoch': 3} {'type': 'loss', 'content': 0.011502564884722233, 'timestamp': '2025-10-01 04:14:03.516281', 'step': 2429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.547165', 'step': 2429, 'epoch': 3} {'type': 'loss', 'content': 0.015590599738061428, 'timestamp': '2025-10-01 04:14:03.550344', 'step': 2430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:03.583905', 'step': 2430, 'epoch': 3} {'type': 'loss', 'content': 0.004602809436619282, 'timestamp': '2025-10-01 04:14:03.586653', 'step': 2431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.620359', 'step': 2431, 'epoch': 3} {'type': 'loss', 'content': 0.0069076851941645145, 'timestamp': '2025-10-01 04:14:03.644622', 'step': 2432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:03.679199', 'step': 2432, 'epoch': 3} {'type': 'loss', 'content': 0.014969917014241219, 'timestamp': '2025-10-01 04:14:03.681570', 'step': 2433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.714456', 'step': 2433, 'epoch': 3} {'type': 'loss', 'content': 0.003453887300565839, 'timestamp': '2025-10-01 04:14:03.717138', 'step': 2434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.748953', 'step': 2434, 'epoch': 3} {'type': 'loss', 'content': 0.00447799963876605, 'timestamp': '2025-10-01 04:14:03.751378', 'step': 2435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.781410', 'step': 2435, 'epoch': 3} {'type': 'loss', 'content': 0.008956313133239746, 'timestamp': '2025-10-01 04:14:03.805616', 'step': 2436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.842161', 'step': 2436, 'epoch': 3} {'type': 'loss', 'content': 0.001485818182118237, 'timestamp': '2025-10-01 04:14:03.844463', 'step': 2437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.875202', 'step': 2437, 'epoch': 3} {'type': 'loss', 'content': 0.009877298958599567, 'timestamp': '2025-10-01 04:14:03.877840', 'step': 2438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.907977', 'step': 2438, 'epoch': 3} {'type': 'loss', 'content': 0.023570170626044273, 'timestamp': '2025-10-01 04:14:03.910359', 'step': 2439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:03.941080', 'step': 2439, 'epoch': 3} {'type': 'loss', 'content': 0.003152408404275775, 'timestamp': '2025-10-01 04:14:03.965628', 'step': 2440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:03.998071', 'step': 2440, 'epoch': 3} {'type': 'loss', 'content': 0.006031573750078678, 'timestamp': '2025-10-01 04:14:04.000541', 'step': 2441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:04.031851', 'step': 2441, 'epoch': 3} {'type': 'loss', 'content': 0.005832657217979431, 'timestamp': '2025-10-01 04:14:04.034058', 'step': 2442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:04.065568', 'step': 2442, 'epoch': 3} {'type': 'loss', 'content': 0.005281724501401186, 'timestamp': '2025-10-01 04:14:04.067949', 'step': 2443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:04.097874', 'step': 2443, 'epoch': 3} {'type': 'loss', 'content': 0.005685365758836269, 'timestamp': '2025-10-01 04:14:04.121907', 'step': 2444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:04.152927', 'step': 2444, 'epoch': 3} {'type': 'loss', 'content': 0.020365357398986816, 'timestamp': '2025-10-01 04:14:04.155486', 'step': 2445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:04.185968', 'step': 2445, 'epoch': 3} {'type': 'loss', 'content': 0.0016104949172586203, 'timestamp': '2025-10-01 04:14:04.188364', 'step': 2446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:04.219932', 'step': 2446, 'epoch': 3} {'type': 'loss', 'content': 0.004023754503577948, 'timestamp': '2025-10-01 04:14:04.223340', 'step': 2447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:04.256884', 'step': 2447, 'epoch': 3} {'type': 'loss', 'content': 0.0035851493012160063, 'timestamp': '2025-10-01 04:14:04.282290', 'step': 2448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:04.315053', 'step': 2448, 'epoch': 3} {'type': 'loss', 'content': 0.03891408443450928, 'timestamp': '2025-10-01 04:14:04.317928', 'step': 2449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:04.351340', 'step': 2449, 'epoch': 3} {'type': 'loss', 'content': 0.008011159487068653, 'timestamp': '2025-10-01 04:14:04.354461', 'step': 2450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:04.388876', 'step': 2450, 'epoch': 3} {'type': 'loss', 'content': 0.016046635806560516, 'timestamp': '2025-10-01 04:14:04.391874', 'step': 2451, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:05.221196', 'step': 2451, 'epoch': 3} {'type': 'pplx', 'content': 58824101.59490909, 'timestamp': '2025-10-01 04:14:05.225001', 'step': 2451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.256821', 'step': 2451, 'epoch': 3} {'type': 'loss', 'content': 0.019496530294418335, 'timestamp': '2025-10-01 04:14:05.288776', 'step': 2452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.347822', 'step': 2452, 'epoch': 3} {'type': 'loss', 'content': 0.004884731490164995, 'timestamp': '2025-10-01 04:14:05.350206', 'step': 2453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.383626', 'step': 2453, 'epoch': 3} {'type': 'loss', 'content': 0.0024897728580981493, 'timestamp': '2025-10-01 04:14:05.386011', 'step': 2454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:05.417936', 'step': 2454, 'epoch': 3} {'type': 'loss', 'content': 0.0227744672447443, 'timestamp': '2025-10-01 04:14:05.421524', 'step': 2455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:05.457406', 'step': 2455, 'epoch': 3} {'type': 'loss', 'content': 0.001688766060397029, 'timestamp': '2025-10-01 04:14:05.482534', 'step': 2456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.514338', 'step': 2456, 'epoch': 3} {'type': 'loss', 'content': 0.001542889280244708, 'timestamp': '2025-10-01 04:14:05.517040', 'step': 2457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.551005', 'step': 2457, 'epoch': 3} {'type': 'loss', 'content': 0.009548446163535118, 'timestamp': '2025-10-01 04:14:05.553160', 'step': 2458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.586414', 'step': 2458, 'epoch': 3} {'type': 'loss', 'content': 0.004925273358821869, 'timestamp': '2025-10-01 04:14:05.588532', 'step': 2459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.619974', 'step': 2459, 'epoch': 3} {'type': 'loss', 'content': 0.0032642006408423185, 'timestamp': '2025-10-01 04:14:05.646415', 'step': 2460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.678964', 'step': 2460, 'epoch': 3} {'type': 'loss', 'content': 0.0020129564218223095, 'timestamp': '2025-10-01 04:14:05.684312', 'step': 2461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.718026', 'step': 2461, 'epoch': 3} {'type': 'loss', 'content': 0.018269294872879982, 'timestamp': '2025-10-01 04:14:05.720987', 'step': 2462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.762885', 'step': 2462, 'epoch': 3} {'type': 'loss', 'content': 0.02364032343029976, 'timestamp': '2025-10-01 04:14:05.766163', 'step': 2463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.807096', 'step': 2463, 'epoch': 3} {'type': 'loss', 'content': 0.015023179352283478, 'timestamp': '2025-10-01 04:14:05.831458', 'step': 2464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.863001', 'step': 2464, 'epoch': 3} {'type': 'loss', 'content': 0.0021603351924568415, 'timestamp': '2025-10-01 04:14:05.865465', 'step': 2465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:05.897938', 'step': 2465, 'epoch': 3} {'type': 'loss', 'content': 0.0008657817961648107, 'timestamp': '2025-10-01 04:14:05.901751', 'step': 2466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.938873', 'step': 2466, 'epoch': 3} {'type': 'loss', 'content': 0.005796634126454592, 'timestamp': '2025-10-01 04:14:05.941195', 'step': 2467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:05.973410', 'step': 2467, 'epoch': 3} {'type': 'loss', 'content': 0.05339441075921059, 'timestamp': '2025-10-01 04:14:05.998674', 'step': 2468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.037270', 'step': 2468, 'epoch': 3} {'type': 'loss', 'content': 0.003976800944656134, 'timestamp': '2025-10-01 04:14:06.040353', 'step': 2469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.074359', 'step': 2469, 'epoch': 3} {'type': 'loss', 'content': 0.03016142174601555, 'timestamp': '2025-10-01 04:14:06.077588', 'step': 2470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.118938', 'step': 2470, 'epoch': 3} {'type': 'loss', 'content': 0.0010798171861097217, 'timestamp': '2025-10-01 04:14:06.121907', 'step': 2471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.154090', 'step': 2471, 'epoch': 3} {'type': 'loss', 'content': 0.0030930840875953436, 'timestamp': '2025-10-01 04:14:06.180904', 'step': 2472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.222815', 'step': 2472, 'epoch': 3} {'type': 'loss', 'content': 0.026820823550224304, 'timestamp': '2025-10-01 04:14:06.226132', 'step': 2473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.261007', 'step': 2473, 'epoch': 3} {'type': 'loss', 'content': 0.009904695674777031, 'timestamp': '2025-10-01 04:14:06.267564', 'step': 2474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.303927', 'step': 2474, 'epoch': 3} {'type': 'loss', 'content': 0.0008741992642171681, 'timestamp': '2025-10-01 04:14:06.307253', 'step': 2475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.343799', 'step': 2475, 'epoch': 3} {'type': 'loss', 'content': 0.0030690168496221304, 'timestamp': '2025-10-01 04:14:06.368367', 'step': 2476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.404554', 'step': 2476, 'epoch': 3} {'type': 'loss', 'content': 0.0016391616081818938, 'timestamp': '2025-10-01 04:14:06.413010', 'step': 2477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.451251', 'step': 2477, 'epoch': 3} {'type': 'loss', 'content': 0.012302136048674583, 'timestamp': '2025-10-01 04:14:06.454209', 'step': 2478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.484875', 'step': 2478, 'epoch': 3} {'type': 'loss', 'content': 0.0012789281317964196, 'timestamp': '2025-10-01 04:14:06.487546', 'step': 2479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.519730', 'step': 2479, 'epoch': 3} {'type': 'loss', 'content': 0.012554749846458435, 'timestamp': '2025-10-01 04:14:06.544387', 'step': 2480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.578237', 'step': 2480, 'epoch': 3} {'type': 'loss', 'content': 0.01758808270096779, 'timestamp': '2025-10-01 04:14:06.580723', 'step': 2481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.612731', 'step': 2481, 'epoch': 3} {'type': 'loss', 'content': 0.0004258893895894289, 'timestamp': '2025-10-01 04:14:06.615496', 'step': 2482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:06.649647', 'step': 2482, 'epoch': 3} {'type': 'loss', 'content': 0.0005650474922731519, 'timestamp': '2025-10-01 04:14:06.652582', 'step': 2483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:06.685697', 'step': 2483, 'epoch': 3} {'type': 'loss', 'content': 0.00029421233921311796, 'timestamp': '2025-10-01 04:14:06.709290', 'step': 2484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.739410', 'step': 2484, 'epoch': 3} {'type': 'loss', 'content': 0.02421017363667488, 'timestamp': '2025-10-01 04:14:06.742135', 'step': 2485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.772500', 'step': 2485, 'epoch': 3} {'type': 'loss', 'content': 0.013585188426077366, 'timestamp': '2025-10-01 04:14:06.774728', 'step': 2486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.804932', 'step': 2486, 'epoch': 3} {'type': 'loss', 'content': 0.02027386613190174, 'timestamp': '2025-10-01 04:14:06.807363', 'step': 2487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.837182', 'step': 2487, 'epoch': 3} {'type': 'loss', 'content': 0.03536064550280571, 'timestamp': '2025-10-01 04:14:06.861548', 'step': 2488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.892325', 'step': 2488, 'epoch': 3} {'type': 'loss', 'content': 0.027367528527975082, 'timestamp': '2025-10-01 04:14:06.894669', 'step': 2489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.924585', 'step': 2489, 'epoch': 3} {'type': 'loss', 'content': 0.0001808692904887721, 'timestamp': '2025-10-01 04:14:06.926917', 'step': 2490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:06.957215', 'step': 2490, 'epoch': 3} {'type': 'loss', 'content': 9.770004544407129e-05, 'timestamp': '2025-10-01 04:14:06.960113', 'step': 2491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:06.991117', 'step': 2491, 'epoch': 3} {'type': 'loss', 'content': 0.047163479030132294, 'timestamp': '2025-10-01 04:14:07.014873', 'step': 2492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:07.045727', 'step': 2492, 'epoch': 3} {'type': 'loss', 'content': 0.0003407765761949122, 'timestamp': '2025-10-01 04:14:07.047852', 'step': 2493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:07.078555', 'step': 2493, 'epoch': 3} {'type': 'loss', 'content': 0.0008606308838352561, 'timestamp': '2025-10-01 04:14:07.081022', 'step': 2494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:07.120330', 'step': 2494, 'epoch': 3} {'type': 'loss', 'content': 0.02153780125081539, 'timestamp': '2025-10-01 04:14:07.122917', 'step': 2495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:07.152750', 'step': 2495, 'epoch': 3} {'type': 'loss', 'content': 0.0014575996901839972, 'timestamp': '2025-10-01 04:14:07.176603', 'step': 2496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:07.206559', 'step': 2496, 'epoch': 3} {'type': 'loss', 'content': 0.0022539051715284586, 'timestamp': '2025-10-01 04:14:07.209256', 'step': 2497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:07.239927', 'step': 2497, 'epoch': 3} {'type': 'loss', 'content': 0.0031828132923692465, 'timestamp': '2025-10-01 04:14:07.242121', 'step': 2498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:07.273082', 'step': 2498, 'epoch': 3} {'type': 'loss', 'content': 0.0009762569679878652, 'timestamp': '2025-10-01 04:14:07.275896', 'step': 2499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:07.308529', 'step': 2499, 'epoch': 3} {'type': 'loss', 'content': 0.001326889730989933, 'timestamp': '2025-10-01 04:14:07.332693', 'step': 2500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2500', 'timestamp': '2025-10-01 04:14:12.148171', 'step': 2500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:12.199408', 'step': 2500, 'epoch': 3} {'type': 'loss', 'content': 0.002399686025455594, 'timestamp': '2025-10-01 04:14:12.202025', 'step': 2501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:12.234214', 'step': 2501, 'epoch': 3} {'type': 'loss', 'content': 0.009733153507113457, 'timestamp': '2025-10-01 04:14:12.236908', 'step': 2502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:12.268922', 'step': 2502, 'epoch': 3} {'type': 'loss', 'content': 0.031686168164014816, 'timestamp': '2025-10-01 04:14:12.271423', 'step': 2503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:12.302847', 'step': 2503, 'epoch': 3} {'type': 'loss', 'content': 0.049481626600027084, 'timestamp': '2025-10-01 04:14:12.327083', 'step': 2504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:12.359654', 'step': 2504, 'epoch': 3} {'type': 'loss', 'content': 0.0006520474562421441, 'timestamp': '2025-10-01 04:14:12.362180', 'step': 2505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:12.393758', 'step': 2505, 'epoch': 3} {'type': 'loss', 'content': 0.015490526333451271, 'timestamp': '2025-10-01 04:14:12.397216', 'step': 2506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:12.429563', 'step': 2506, 'epoch': 3} {'type': 'loss', 'content': 0.003928300458937883, 'timestamp': '2025-10-01 04:14:12.431997', 'step': 2507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:12.462731', 'step': 2507, 'epoch': 3} {'type': 'loss', 'content': 0.031879812479019165, 'timestamp': '2025-10-01 04:14:12.488256', 'step': 2508, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:13.246351', 'step': 2508, 'epoch': 3} {'type': 'pplx', 'content': 60019287.10333549, 'timestamp': '2025-10-01 04:14:13.248288', 'step': 2508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.276815', 'step': 2508, 'epoch': 3} {'type': 'loss', 'content': 0.05604308471083641, 'timestamp': '2025-10-01 04:14:13.278948', 'step': 2509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.309323', 'step': 2509, 'epoch': 3} {'type': 'loss', 'content': 0.007913426496088505, 'timestamp': '2025-10-01 04:14:13.312382', 'step': 2510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.346529', 'step': 2510, 'epoch': 3} {'type': 'loss', 'content': 0.00519518880173564, 'timestamp': '2025-10-01 04:14:13.349919', 'step': 2511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:13.385194', 'step': 2511, 'epoch': 3} {'type': 'loss', 'content': 0.0252819936722517, 'timestamp': '2025-10-01 04:14:13.410825', 'step': 2512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.444937', 'step': 2512, 'epoch': 3} {'type': 'loss', 'content': 0.029163526371121407, 'timestamp': '2025-10-01 04:14:13.448372', 'step': 2513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.481925', 'step': 2513, 'epoch': 3} {'type': 'loss', 'content': 0.003907273523509502, 'timestamp': '2025-10-01 04:14:13.485025', 'step': 2514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:13.518343', 'step': 2514, 'epoch': 3} {'type': 'loss', 'content': 0.012420463375747204, 'timestamp': '2025-10-01 04:14:13.521087', 'step': 2515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:13.556710', 'step': 2515, 'epoch': 3} {'type': 'loss', 'content': 0.01203235238790512, 'timestamp': '2025-10-01 04:14:13.581378', 'step': 2516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.613494', 'step': 2516, 'epoch': 3} {'type': 'loss', 'content': 0.005583175923675299, 'timestamp': '2025-10-01 04:14:13.616904', 'step': 2517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:13.651226', 'step': 2517, 'epoch': 3} {'type': 'loss', 'content': 0.0028394819237291813, 'timestamp': '2025-10-01 04:14:13.654329', 'step': 2518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:13.690369', 'step': 2518, 'epoch': 3} {'type': 'loss', 'content': 0.024792209267616272, 'timestamp': '2025-10-01 04:14:13.693631', 'step': 2519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.726359', 'step': 2519, 'epoch': 3} {'type': 'loss', 'content': 0.0031874720007181168, 'timestamp': '2025-10-01 04:14:13.751824', 'step': 2520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:13.784826', 'step': 2520, 'epoch': 3} {'type': 'loss', 'content': 0.0023829268757253885, 'timestamp': '2025-10-01 04:14:13.788388', 'step': 2521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.822803', 'step': 2521, 'epoch': 3} {'type': 'loss', 'content': 0.0028786410111933947, 'timestamp': '2025-10-01 04:14:13.825762', 'step': 2522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.858045', 'step': 2522, 'epoch': 3} {'type': 'loss', 'content': 0.0209118090569973, 'timestamp': '2025-10-01 04:14:13.861200', 'step': 2523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:13.894897', 'step': 2523, 'epoch': 3} {'type': 'loss', 'content': 0.005318788345903158, 'timestamp': '2025-10-01 04:14:13.918761', 'step': 2524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:13.951190', 'step': 2524, 'epoch': 3} {'type': 'loss', 'content': 0.007208620198071003, 'timestamp': '2025-10-01 04:14:13.954917', 'step': 2525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:13.992272', 'step': 2525, 'epoch': 3} {'type': 'loss', 'content': 0.004015002399682999, 'timestamp': '2025-10-01 04:14:13.996192', 'step': 2526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.030480', 'step': 2526, 'epoch': 3} {'type': 'loss', 'content': 0.009447134099900723, 'timestamp': '2025-10-01 04:14:14.033944', 'step': 2527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.067732', 'step': 2527, 'epoch': 3} {'type': 'loss', 'content': 0.03116469271481037, 'timestamp': '2025-10-01 04:14:14.092425', 'step': 2528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.127600', 'step': 2528, 'epoch': 3} {'type': 'loss', 'content': 0.0030621045734733343, 'timestamp': '2025-10-01 04:14:14.130340', 'step': 2529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:14.163733', 'step': 2529, 'epoch': 3} {'type': 'loss', 'content': 0.00671646511182189, 'timestamp': '2025-10-01 04:14:14.167129', 'step': 2530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.200423', 'step': 2530, 'epoch': 3} {'type': 'loss', 'content': 0.005486390553414822, 'timestamp': '2025-10-01 04:14:14.203687', 'step': 2531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.237212', 'step': 2531, 'epoch': 3} {'type': 'loss', 'content': 0.0026912540197372437, 'timestamp': '2025-10-01 04:14:14.261531', 'step': 2532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.294926', 'step': 2532, 'epoch': 3} {'type': 'loss', 'content': 0.030198076739907265, 'timestamp': '2025-10-01 04:14:14.298234', 'step': 2533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.331095', 'step': 2533, 'epoch': 3} {'type': 'loss', 'content': 0.02517978474497795, 'timestamp': '2025-10-01 04:14:14.334202', 'step': 2534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:14.368731', 'step': 2534, 'epoch': 3} {'type': 'loss', 'content': 0.00408556591719389, 'timestamp': '2025-10-01 04:14:14.371962', 'step': 2535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.404303', 'step': 2535, 'epoch': 3} {'type': 'loss', 'content': 0.00379009242169559, 'timestamp': '2025-10-01 04:14:14.428750', 'step': 2536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.463135', 'step': 2536, 'epoch': 3} {'type': 'loss', 'content': 0.0170338936150074, 'timestamp': '2025-10-01 04:14:14.466208', 'step': 2537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.499354', 'step': 2537, 'epoch': 3} {'type': 'loss', 'content': 0.01684098318219185, 'timestamp': '2025-10-01 04:14:14.502471', 'step': 2538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:14.536295', 'step': 2538, 'epoch': 3} {'type': 'loss', 'content': 0.014581390656530857, 'timestamp': '2025-10-01 04:14:14.540270', 'step': 2539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:14.574547', 'step': 2539, 'epoch': 3} {'type': 'loss', 'content': 0.0013121498050168157, 'timestamp': '2025-10-01 04:14:14.599204', 'step': 2540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.632799', 'step': 2540, 'epoch': 3} {'type': 'loss', 'content': 0.0016053777653723955, 'timestamp': '2025-10-01 04:14:14.636173', 'step': 2541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.671324', 'step': 2541, 'epoch': 3} {'type': 'loss', 'content': 0.003269800217822194, 'timestamp': '2025-10-01 04:14:14.674486', 'step': 2542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.707494', 'step': 2542, 'epoch': 3} {'type': 'loss', 'content': 0.003128649899736047, 'timestamp': '2025-10-01 04:14:14.710926', 'step': 2543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.743966', 'step': 2543, 'epoch': 3} {'type': 'loss', 'content': 0.03637135028839111, 'timestamp': '2025-10-01 04:14:14.768427', 'step': 2544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:14:14.800812', 'step': 2544, 'epoch': 3} {'type': 'loss', 'content': 0.005533142946660519, 'timestamp': '2025-10-01 04:14:14.803153', 'step': 2545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.836575', 'step': 2545, 'epoch': 3} {'type': 'loss', 'content': 0.0069894432090222836, 'timestamp': '2025-10-01 04:14:14.839261', 'step': 2546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.872293', 'step': 2546, 'epoch': 3} {'type': 'loss', 'content': 0.0031484358478337526, 'timestamp': '2025-10-01 04:14:14.875897', 'step': 2547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:14.910486', 'step': 2547, 'epoch': 3} {'type': 'loss', 'content': 0.002964512910693884, 'timestamp': '2025-10-01 04:14:14.935708', 'step': 2548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:14.969640', 'step': 2548, 'epoch': 3} {'type': 'loss', 'content': 0.013624334707856178, 'timestamp': '2025-10-01 04:14:14.973723', 'step': 2549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.005054', 'step': 2549, 'epoch': 3} {'type': 'loss', 'content': 0.010864500887691975, 'timestamp': '2025-10-01 04:14:15.007410', 'step': 2550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.037818', 'step': 2550, 'epoch': 3} {'type': 'loss', 'content': 0.015076599083840847, 'timestamp': '2025-10-01 04:14:15.040439', 'step': 2551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.071172', 'step': 2551, 'epoch': 3} {'type': 'loss', 'content': 0.0038652902003377676, 'timestamp': '2025-10-01 04:14:15.094701', 'step': 2552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.125585', 'step': 2552, 'epoch': 3} {'type': 'loss', 'content': 0.0028856198769062757, 'timestamp': '2025-10-01 04:14:15.128081', 'step': 2553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:15.165898', 'step': 2553, 'epoch': 3} {'type': 'loss', 'content': 0.021579725667834282, 'timestamp': '2025-10-01 04:14:15.168973', 'step': 2554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.200424', 'step': 2554, 'epoch': 3} {'type': 'loss', 'content': 0.0009834429947659373, 'timestamp': '2025-10-01 04:14:15.202996', 'step': 2555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.235241', 'step': 2555, 'epoch': 3} {'type': 'loss', 'content': 0.0009941781172528863, 'timestamp': '2025-10-01 04:14:15.259290', 'step': 2556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.291422', 'step': 2556, 'epoch': 3} {'type': 'loss', 'content': 0.0052528358064591885, 'timestamp': '2025-10-01 04:14:15.294602', 'step': 2557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.326972', 'step': 2557, 'epoch': 3} {'type': 'loss', 'content': 0.02679816260933876, 'timestamp': '2025-10-01 04:14:15.334507', 'step': 2558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.375140', 'step': 2558, 'epoch': 3} {'type': 'loss', 'content': 0.026580344885587692, 'timestamp': '2025-10-01 04:14:15.378071', 'step': 2559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:15.409296', 'step': 2559, 'epoch': 3} {'type': 'loss', 'content': 0.0031278219539672136, 'timestamp': '2025-10-01 04:14:15.433265', 'step': 2560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:15.463964', 'step': 2560, 'epoch': 3} {'type': 'loss', 'content': 0.010876401327550411, 'timestamp': '2025-10-01 04:14:15.466255', 'step': 2561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.496994', 'step': 2561, 'epoch': 3} {'type': 'loss', 'content': 0.002058443846181035, 'timestamp': '2025-10-01 04:14:15.499590', 'step': 2562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.529767', 'step': 2562, 'epoch': 3} {'type': 'loss', 'content': 0.002466656733304262, 'timestamp': '2025-10-01 04:14:15.532093', 'step': 2563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.562601', 'step': 2563, 'epoch': 3} {'type': 'loss', 'content': 0.013181686401367188, 'timestamp': '2025-10-01 04:14:15.586676', 'step': 2564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:15.617457', 'step': 2564, 'epoch': 3} {'type': 'loss', 'content': 0.005432442296296358, 'timestamp': '2025-10-01 04:14:15.619867', 'step': 2565, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:16.382678', 'step': 2565, 'epoch': 3} {'type': 'pplx', 'content': 50190753.812678196, 'timestamp': '2025-10-01 04:14:16.384933', 'step': 2565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.414573', 'step': 2565, 'epoch': 3} {'type': 'loss', 'content': 0.014795447699725628, 'timestamp': '2025-10-01 04:14:16.417114', 'step': 2566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:16.448651', 'step': 2566, 'epoch': 3} {'type': 'loss', 'content': 0.005171437747776508, 'timestamp': '2025-10-01 04:14:16.451078', 'step': 2567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:16.482332', 'step': 2567, 'epoch': 3} {'type': 'loss', 'content': 0.003639362519606948, 'timestamp': '2025-10-01 04:14:16.506129', 'step': 2568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:16.537415', 'step': 2568, 'epoch': 3} {'type': 'loss', 'content': 0.001160087645985186, 'timestamp': '2025-10-01 04:14:16.540098', 'step': 2569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.569950', 'step': 2569, 'epoch': 3} {'type': 'loss', 'content': 0.010151208378374577, 'timestamp': '2025-10-01 04:14:16.572165', 'step': 2570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.602491', 'step': 2570, 'epoch': 3} {'type': 'loss', 'content': 0.011499554850161076, 'timestamp': '2025-10-01 04:14:16.604979', 'step': 2571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.635943', 'step': 2571, 'epoch': 3} {'type': 'loss', 'content': 0.006028040777891874, 'timestamp': '2025-10-01 04:14:16.659814', 'step': 2572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.690759', 'step': 2572, 'epoch': 3} {'type': 'loss', 'content': 0.0026039001531898975, 'timestamp': '2025-10-01 04:14:16.693138', 'step': 2573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.727217', 'step': 2573, 'epoch': 3} {'type': 'loss', 'content': 0.0056630512699484825, 'timestamp': '2025-10-01 04:14:16.730655', 'step': 2574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:16.761711', 'step': 2574, 'epoch': 3} {'type': 'loss', 'content': 0.010089273564517498, 'timestamp': '2025-10-01 04:14:16.763937', 'step': 2575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:16.795221', 'step': 2575, 'epoch': 3} {'type': 'loss', 'content': 0.004652588162571192, 'timestamp': '2025-10-01 04:14:16.819081', 'step': 2576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.850124', 'step': 2576, 'epoch': 3} {'type': 'loss', 'content': 0.003421203466132283, 'timestamp': '2025-10-01 04:14:16.852842', 'step': 2577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:16.887000', 'step': 2577, 'epoch': 3} {'type': 'loss', 'content': 0.014077236875891685, 'timestamp': '2025-10-01 04:14:16.889395', 'step': 2578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:16.920436', 'step': 2578, 'epoch': 3} {'type': 'loss', 'content': 0.0071408068761229515, 'timestamp': '2025-10-01 04:14:16.924259', 'step': 2579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:16.955419', 'step': 2579, 'epoch': 3} {'type': 'loss', 'content': 0.004757278598845005, 'timestamp': '2025-10-01 04:14:16.979514', 'step': 2580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:17.011149', 'step': 2580, 'epoch': 3} {'type': 'loss', 'content': 0.002742630196735263, 'timestamp': '2025-10-01 04:14:17.013407', 'step': 2581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.044745', 'step': 2581, 'epoch': 3} {'type': 'loss', 'content': 0.004369654227048159, 'timestamp': '2025-10-01 04:14:17.047351', 'step': 2582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.078686', 'step': 2582, 'epoch': 3} {'type': 'loss', 'content': 0.004402398131787777, 'timestamp': '2025-10-01 04:14:17.086949', 'step': 2583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:17.117369', 'step': 2583, 'epoch': 3} {'type': 'loss', 'content': 0.0065698400139808655, 'timestamp': '2025-10-01 04:14:17.141250', 'step': 2584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:17.173057', 'step': 2584, 'epoch': 3} {'type': 'loss', 'content': 0.008920601569116116, 'timestamp': '2025-10-01 04:14:17.175493', 'step': 2585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.206361', 'step': 2585, 'epoch': 3} {'type': 'loss', 'content': 0.04373012110590935, 'timestamp': '2025-10-01 04:14:17.208825', 'step': 2586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.239396', 'step': 2586, 'epoch': 3} {'type': 'loss', 'content': 0.005760248750448227, 'timestamp': '2025-10-01 04:14:17.241940', 'step': 2587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:17.273584', 'step': 2587, 'epoch': 3} {'type': 'loss', 'content': 0.0028052874840795994, 'timestamp': '2025-10-01 04:14:17.297589', 'step': 2588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:17.328965', 'step': 2588, 'epoch': 3} {'type': 'loss', 'content': 0.0062300702556967735, 'timestamp': '2025-10-01 04:14:17.331347', 'step': 2589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:17.363434', 'step': 2589, 'epoch': 3} {'type': 'loss', 'content': 0.010736130177974701, 'timestamp': '2025-10-01 04:14:17.365760', 'step': 2590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.399448', 'step': 2590, 'epoch': 3} {'type': 'loss', 'content': 0.005298420321196318, 'timestamp': '2025-10-01 04:14:17.403017', 'step': 2591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:17.434729', 'step': 2591, 'epoch': 3} {'type': 'loss', 'content': 0.002143836347386241, 'timestamp': '2025-10-01 04:14:17.459706', 'step': 2592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:17.493614', 'step': 2592, 'epoch': 3} {'type': 'loss', 'content': 0.0014678857987746596, 'timestamp': '2025-10-01 04:14:17.495902', 'step': 2593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.526838', 'step': 2593, 'epoch': 3} {'type': 'loss', 'content': 0.0037119637709110975, 'timestamp': '2025-10-01 04:14:17.529537', 'step': 2594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.561227', 'step': 2594, 'epoch': 3} {'type': 'loss', 'content': 0.011288485489785671, 'timestamp': '2025-10-01 04:14:17.564211', 'step': 2595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-10-01 04:14:17.595353', 'step': 2595, 'epoch': 3} {'type': 'loss', 'content': 0.02088664285838604, 'timestamp': '2025-10-01 04:14:17.619374', 'step': 2596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.650492', 'step': 2596, 'epoch': 3} {'type': 'loss', 'content': 0.0031742292921990156, 'timestamp': '2025-10-01 04:14:17.652633', 'step': 2597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.682786', 'step': 2597, 'epoch': 3} {'type': 'loss', 'content': 0.0014582558069378138, 'timestamp': '2025-10-01 04:14:17.685425', 'step': 2598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.719604', 'step': 2598, 'epoch': 3} {'type': 'loss', 'content': 0.007793535012751818, 'timestamp': '2025-10-01 04:14:17.722033', 'step': 2599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.754084', 'step': 2599, 'epoch': 3} {'type': 'loss', 'content': 0.002564321970567107, 'timestamp': '2025-10-01 04:14:17.777854', 'step': 2600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.809076', 'step': 2600, 'epoch': 3} {'type': 'loss', 'content': 0.008807080797851086, 'timestamp': '2025-10-01 04:14:17.811676', 'step': 2601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.843123', 'step': 2601, 'epoch': 3} {'type': 'loss', 'content': 0.008557865396142006, 'timestamp': '2025-10-01 04:14:17.845710', 'step': 2602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.875868', 'step': 2602, 'epoch': 3} {'type': 'loss', 'content': 0.0007431748090311885, 'timestamp': '2025-10-01 04:14:17.878083', 'step': 2603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:17.909375', 'step': 2603, 'epoch': 3} {'type': 'loss', 'content': 0.023754892870783806, 'timestamp': '2025-10-01 04:14:17.933487', 'step': 2604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:17.966059', 'step': 2604, 'epoch': 3} {'type': 'loss', 'content': 0.017130501568317413, 'timestamp': '2025-10-01 04:14:17.968495', 'step': 2605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:17.999133', 'step': 2605, 'epoch': 3} {'type': 'loss', 'content': 0.0002764493983704597, 'timestamp': '2025-10-01 04:14:18.001793', 'step': 2606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.032473', 'step': 2606, 'epoch': 3} {'type': 'loss', 'content': 0.021002618595957756, 'timestamp': '2025-10-01 04:14:18.036349', 'step': 2607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.068330', 'step': 2607, 'epoch': 3} {'type': 'loss', 'content': 0.001229580258950591, 'timestamp': '2025-10-01 04:14:18.092287', 'step': 2608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:18.121831', 'step': 2608, 'epoch': 3} {'type': 'loss', 'content': 0.006011900492012501, 'timestamp': '2025-10-01 04:14:18.123843', 'step': 2609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:18.156104', 'step': 2609, 'epoch': 3} {'type': 'loss', 'content': 0.0011188319185748696, 'timestamp': '2025-10-01 04:14:18.159077', 'step': 2610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:18.194921', 'step': 2610, 'epoch': 3} {'type': 'loss', 'content': 0.0035923365503549576, 'timestamp': '2025-10-01 04:14:18.197480', 'step': 2611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.228575', 'step': 2611, 'epoch': 3} {'type': 'loss', 'content': 0.0025855558924376965, 'timestamp': '2025-10-01 04:14:18.255153', 'step': 2612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:18.292945', 'step': 2612, 'epoch': 3} {'type': 'loss', 'content': 0.04813668876886368, 'timestamp': '2025-10-01 04:14:18.295171', 'step': 2613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.326638', 'step': 2613, 'epoch': 3} {'type': 'loss', 'content': 0.00044949265429750085, 'timestamp': '2025-10-01 04:14:18.329294', 'step': 2614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.360404', 'step': 2614, 'epoch': 3} {'type': 'loss', 'content': 0.0025543272495269775, 'timestamp': '2025-10-01 04:14:18.362947', 'step': 2615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:18.395636', 'step': 2615, 'epoch': 3} {'type': 'loss', 'content': 0.0008481559343636036, 'timestamp': '2025-10-01 04:14:18.419708', 'step': 2616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.451499', 'step': 2616, 'epoch': 3} {'type': 'loss', 'content': 0.008253288455307484, 'timestamp': '2025-10-01 04:14:18.455312', 'step': 2617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:18.489097', 'step': 2617, 'epoch': 3} {'type': 'loss', 'content': 0.0006239612703211606, 'timestamp': '2025-10-01 04:14:18.492523', 'step': 2618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.524474', 'step': 2618, 'epoch': 3} {'type': 'loss', 'content': 0.00023207686899695545, 'timestamp': '2025-10-01 04:14:18.527162', 'step': 2619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:18.558301', 'step': 2619, 'epoch': 3} {'type': 'loss', 'content': 4.780263043357991e-05, 'timestamp': '2025-10-01 04:14:18.582702', 'step': 2620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.615807', 'step': 2620, 'epoch': 3} {'type': 'loss', 'content': 2.830891935445834e-05, 'timestamp': '2025-10-01 04:14:18.617963', 'step': 2621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:18.648595', 'step': 2621, 'epoch': 3} {'type': 'loss', 'content': 0.00848412699997425, 'timestamp': '2025-10-01 04:14:18.651200', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:19.403155', 'step': 2622, 'epoch': 3} {'type': 'pplx', 'content': 63792326.1143382, 'timestamp': '2025-10-01 04:14:19.405245', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.434615', 'step': 2622, 'epoch': 3} {'type': 'loss', 'content': 0.0022637471556663513, 'timestamp': '2025-10-01 04:14:19.438355', 'step': 2623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:19.469816', 'step': 2623, 'epoch': 3} {'type': 'loss', 'content': 3.309018211439252e-05, 'timestamp': '2025-10-01 04:14:19.493957', 'step': 2624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.524313', 'step': 2624, 'epoch': 3} {'type': 'loss', 'content': 0.002056579804047942, 'timestamp': '2025-10-01 04:14:19.528433', 'step': 2625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.561793', 'step': 2625, 'epoch': 3} {'type': 'loss', 'content': 0.01622423343360424, 'timestamp': '2025-10-01 04:14:19.564654', 'step': 2626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.595641', 'step': 2626, 'epoch': 3} {'type': 'loss', 'content': 0.00038660448626615107, 'timestamp': '2025-10-01 04:14:19.599181', 'step': 2627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.629235', 'step': 2627, 'epoch': 3} {'type': 'loss', 'content': 0.008610601536929607, 'timestamp': '2025-10-01 04:14:19.653427', 'step': 2628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.687698', 'step': 2628, 'epoch': 3} {'type': 'loss', 'content': 0.0007182090193964541, 'timestamp': '2025-10-01 04:14:19.690621', 'step': 2629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.724730', 'step': 2629, 'epoch': 3} {'type': 'loss', 'content': 0.00033229749533347785, 'timestamp': '2025-10-01 04:14:19.728034', 'step': 2630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.758266', 'step': 2630, 'epoch': 3} {'type': 'loss', 'content': 0.000259073858615011, 'timestamp': '2025-10-01 04:14:19.760804', 'step': 2631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:19.794734', 'step': 2631, 'epoch': 3} {'type': 'loss', 'content': 0.004574598278850317, 'timestamp': '2025-10-01 04:14:19.819707', 'step': 2632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.850263', 'step': 2632, 'epoch': 3} {'type': 'loss', 'content': 0.00040219916263595223, 'timestamp': '2025-10-01 04:14:19.853064', 'step': 2633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.884737', 'step': 2633, 'epoch': 3} {'type': 'loss', 'content': 0.0024398225359618664, 'timestamp': '2025-10-01 04:14:19.887181', 'step': 2634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:19.917698', 'step': 2634, 'epoch': 3} {'type': 'loss', 'content': 0.00624752277508378, 'timestamp': '2025-10-01 04:14:19.920816', 'step': 2635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:19.952694', 'step': 2635, 'epoch': 3} {'type': 'loss', 'content': 0.028992334380745888, 'timestamp': '2025-10-01 04:14:19.976994', 'step': 2636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.008692', 'step': 2636, 'epoch': 3} {'type': 'loss', 'content': 0.004315625410526991, 'timestamp': '2025-10-01 04:14:20.011478', 'step': 2637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.043629', 'step': 2637, 'epoch': 3} {'type': 'loss', 'content': 0.0007517217891290784, 'timestamp': '2025-10-01 04:14:20.046281', 'step': 2638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.076944', 'step': 2638, 'epoch': 3} {'type': 'loss', 'content': 0.004551793914288282, 'timestamp': '2025-10-01 04:14:20.079535', 'step': 2639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.110600', 'step': 2639, 'epoch': 3} {'type': 'loss', 'content': 0.002312313299626112, 'timestamp': '2025-10-01 04:14:20.134985', 'step': 2640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.166039', 'step': 2640, 'epoch': 3} {'type': 'loss', 'content': 0.0001878144685178995, 'timestamp': '2025-10-01 04:14:20.168931', 'step': 2641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.199235', 'step': 2641, 'epoch': 3} {'type': 'loss', 'content': 0.0020596140529960394, 'timestamp': '2025-10-01 04:14:20.202099', 'step': 2642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:20.233575', 'step': 2642, 'epoch': 3} {'type': 'loss', 'content': 0.007440303452312946, 'timestamp': '2025-10-01 04:14:20.237761', 'step': 2643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:20.269287', 'step': 2643, 'epoch': 3} {'type': 'loss', 'content': 0.00196535955183208, 'timestamp': '2025-10-01 04:14:20.295526', 'step': 2644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.326685', 'step': 2644, 'epoch': 3} {'type': 'loss', 'content': 0.0003387908509466797, 'timestamp': '2025-10-01 04:14:20.329188', 'step': 2645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.361934', 'step': 2645, 'epoch': 3} {'type': 'loss', 'content': 0.0007078064372763038, 'timestamp': '2025-10-01 04:14:20.365156', 'step': 2646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.397017', 'step': 2646, 'epoch': 3} {'type': 'loss', 'content': 0.018304944038391113, 'timestamp': '2025-10-01 04:14:20.399560', 'step': 2647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.431741', 'step': 2647, 'epoch': 3} {'type': 'loss', 'content': 0.0036956293042749166, 'timestamp': '2025-10-01 04:14:20.455468', 'step': 2648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.487426', 'step': 2648, 'epoch': 3} {'type': 'loss', 'content': 0.0026661923620849848, 'timestamp': '2025-10-01 04:14:20.490062', 'step': 2649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.520600', 'step': 2649, 'epoch': 3} {'type': 'loss', 'content': 0.0004781464231200516, 'timestamp': '2025-10-01 04:14:20.523290', 'step': 2650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.557150', 'step': 2650, 'epoch': 3} {'type': 'loss', 'content': 0.0003797627578023821, 'timestamp': '2025-10-01 04:14:20.560244', 'step': 2651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.594731', 'step': 2651, 'epoch': 3} {'type': 'loss', 'content': 0.000620409962721169, 'timestamp': '2025-10-01 04:14:20.618941', 'step': 2652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.650015', 'step': 2652, 'epoch': 3} {'type': 'loss', 'content': 0.0004895195597782731, 'timestamp': '2025-10-01 04:14:20.652324', 'step': 2653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.683946', 'step': 2653, 'epoch': 3} {'type': 'loss', 'content': 0.011492596007883549, 'timestamp': '2025-10-01 04:14:20.688089', 'step': 2654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:20.721129', 'step': 2654, 'epoch': 3} {'type': 'loss', 'content': 0.03194532170891762, 'timestamp': '2025-10-01 04:14:20.723852', 'step': 2655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.754910', 'step': 2655, 'epoch': 3} {'type': 'loss', 'content': 0.0101211192086339, 'timestamp': '2025-10-01 04:14:20.779419', 'step': 2656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.810669', 'step': 2656, 'epoch': 3} {'type': 'loss', 'content': 0.00036434808862395585, 'timestamp': '2025-10-01 04:14:20.812945', 'step': 2657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:20.844970', 'step': 2657, 'epoch': 3} {'type': 'loss', 'content': 0.0037049304228276014, 'timestamp': '2025-10-01 04:14:20.847690', 'step': 2658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:20.877959', 'step': 2658, 'epoch': 3} {'type': 'loss', 'content': 0.00512391421943903, 'timestamp': '2025-10-01 04:14:20.880870', 'step': 2659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:20.911538', 'step': 2659, 'epoch': 3} {'type': 'loss', 'content': 0.00954340398311615, 'timestamp': '2025-10-01 04:14:20.935517', 'step': 2660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.966641', 'step': 2660, 'epoch': 3} {'type': 'loss', 'content': 0.0002692260022740811, 'timestamp': '2025-10-01 04:14:20.969091', 'step': 2661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:20.999909', 'step': 2661, 'epoch': 3} {'type': 'loss', 'content': 0.0017530341865494847, 'timestamp': '2025-10-01 04:14:21.002738', 'step': 2662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.034472', 'step': 2662, 'epoch': 3} {'type': 'loss', 'content': 0.00228483765386045, 'timestamp': '2025-10-01 04:14:21.037543', 'step': 2663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.067825', 'step': 2663, 'epoch': 3} {'type': 'loss', 'content': 0.02074584737420082, 'timestamp': '2025-10-01 04:14:21.091774', 'step': 2664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.122398', 'step': 2664, 'epoch': 3} {'type': 'loss', 'content': 0.0003323450800962746, 'timestamp': '2025-10-01 04:14:21.125973', 'step': 2665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.157028', 'step': 2665, 'epoch': 3} {'type': 'loss', 'content': 0.00015716443886049092, 'timestamp': '2025-10-01 04:14:21.159907', 'step': 2666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.192533', 'step': 2666, 'epoch': 3} {'type': 'loss', 'content': 0.0005555102252401412, 'timestamp': '2025-10-01 04:14:21.194946', 'step': 2667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:21.226412', 'step': 2667, 'epoch': 3} {'type': 'loss', 'content': 0.0007580574601888657, 'timestamp': '2025-10-01 04:14:21.250773', 'step': 2668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:21.281355', 'step': 2668, 'epoch': 3} {'type': 'loss', 'content': 4.993396214558743e-05, 'timestamp': '2025-10-01 04:14:21.284168', 'step': 2669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.314955', 'step': 2669, 'epoch': 3} {'type': 'loss', 'content': 0.008731653913855553, 'timestamp': '2025-10-01 04:14:21.318030', 'step': 2670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:21.349832', 'step': 2670, 'epoch': 3} {'type': 'loss', 'content': 0.0017039062222465873, 'timestamp': '2025-10-01 04:14:21.353765', 'step': 2671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.384203', 'step': 2671, 'epoch': 3} {'type': 'loss', 'content': 0.0005672592669725418, 'timestamp': '2025-10-01 04:14:21.408231', 'step': 2672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.439187', 'step': 2672, 'epoch': 3} {'type': 'loss', 'content': 0.004365603905171156, 'timestamp': '2025-10-01 04:14:21.441872', 'step': 2673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:21.478691', 'step': 2673, 'epoch': 3} {'type': 'loss', 'content': 0.00015389944019261748, 'timestamp': '2025-10-01 04:14:21.481530', 'step': 2674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.513860', 'step': 2674, 'epoch': 3} {'type': 'loss', 'content': 0.0008061733096837997, 'timestamp': '2025-10-01 04:14:21.516433', 'step': 2675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.547673', 'step': 2675, 'epoch': 3} {'type': 'loss', 'content': 0.0013293975498527288, 'timestamp': '2025-10-01 04:14:21.571717', 'step': 2676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.602273', 'step': 2676, 'epoch': 3} {'type': 'loss', 'content': 0.001839006319642067, 'timestamp': '2025-10-01 04:14:21.604901', 'step': 2677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:21.636975', 'step': 2677, 'epoch': 3} {'type': 'loss', 'content': 0.002645869040861726, 'timestamp': '2025-10-01 04:14:21.639764', 'step': 2678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:21.670532', 'step': 2678, 'epoch': 3} {'type': 'loss', 'content': 0.00023593958758283406, 'timestamp': '2025-10-01 04:14:21.673214', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:22.439115', 'step': 2679, 'epoch': 3} {'type': 'pplx', 'content': 64678083.10080412, 'timestamp': '2025-10-01 04:14:22.445683', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:22.475693', 'step': 2679, 'epoch': 3} {'type': 'loss', 'content': 2.630149720062036e-05, 'timestamp': '2025-10-01 04:14:22.499577', 'step': 2680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:22.531997', 'step': 2680, 'epoch': 3} {'type': 'loss', 'content': 0.00016436410078313202, 'timestamp': '2025-10-01 04:14:22.534194', 'step': 2681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:22.570502', 'step': 2681, 'epoch': 3} {'type': 'loss', 'content': 0.00018075850675813854, 'timestamp': '2025-10-01 04:14:22.573125', 'step': 2682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:22.605089', 'step': 2682, 'epoch': 3} {'type': 'loss', 'content': 0.00015388346218969673, 'timestamp': '2025-10-01 04:14:22.611991', 'step': 2683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:22.654972', 'step': 2683, 'epoch': 3} {'type': 'loss', 'content': 5.065502045908943e-05, 'timestamp': '2025-10-01 04:14:22.679014', 'step': 2684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:22.709670', 'step': 2684, 'epoch': 3} {'type': 'loss', 'content': 9.08417787286453e-05, 'timestamp': '2025-10-01 04:14:22.712393', 'step': 2685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:22.743437', 'step': 2685, 'epoch': 3} {'type': 'loss', 'content': 0.0006787800812162459, 'timestamp': '2025-10-01 04:14:22.747002', 'step': 2686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:22.778437', 'step': 2686, 'epoch': 3} {'type': 'loss', 'content': 0.0005651679239235818, 'timestamp': '2025-10-01 04:14:22.780923', 'step': 2687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:22.814151', 'step': 2687, 'epoch': 3} {'type': 'loss', 'content': 0.00019555458857212216, 'timestamp': '2025-10-01 04:14:22.838244', 'step': 2688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:22.871133', 'step': 2688, 'epoch': 3} {'type': 'loss', 'content': 1.657810207689181e-05, 'timestamp': '2025-10-01 04:14:22.873991', 'step': 2689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:22.905790', 'step': 2689, 'epoch': 3} {'type': 'loss', 'content': 0.0001516373740741983, 'timestamp': '2025-10-01 04:14:22.908134', 'step': 2690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:22.945389', 'step': 2690, 'epoch': 3} {'type': 'loss', 'content': 0.00043833054951392114, 'timestamp': '2025-10-01 04:14:22.948425', 'step': 2691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:22.980571', 'step': 2691, 'epoch': 3} {'type': 'loss', 'content': 0.00034414706169627607, 'timestamp': '2025-10-01 04:14:23.005092', 'step': 2692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.036093', 'step': 2692, 'epoch': 3} {'type': 'loss', 'content': 0.0003396179818082601, 'timestamp': '2025-10-01 04:14:23.038656', 'step': 2693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.069169', 'step': 2693, 'epoch': 3} {'type': 'loss', 'content': 0.00028526881942525506, 'timestamp': '2025-10-01 04:14:23.071806', 'step': 2694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.102330', 'step': 2694, 'epoch': 3} {'type': 'loss', 'content': 0.00022634358901996166, 'timestamp': '2025-10-01 04:14:23.104959', 'step': 2695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:23.135372', 'step': 2695, 'epoch': 3} {'type': 'loss', 'content': 0.0003560289624147117, 'timestamp': '2025-10-01 04:14:23.159269', 'step': 2696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.189584', 'step': 2696, 'epoch': 3} {'type': 'loss', 'content': 2.8566690161824226e-05, 'timestamp': '2025-10-01 04:14:23.192495', 'step': 2697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.223438', 'step': 2697, 'epoch': 3} {'type': 'loss', 'content': 0.00014089918113313615, 'timestamp': '2025-10-01 04:14:23.225946', 'step': 2698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.256127', 'step': 2698, 'epoch': 3} {'type': 'loss', 'content': 0.00024118837609421462, 'timestamp': '2025-10-01 04:14:23.258490', 'step': 2699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:23.289144', 'step': 2699, 'epoch': 3} {'type': 'loss', 'content': 0.0001321942690992728, 'timestamp': '2025-10-01 04:14:23.313059', 'step': 2700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.343977', 'step': 2700, 'epoch': 3} {'type': 'loss', 'content': 0.0001804428466130048, 'timestamp': '2025-10-01 04:14:23.346306', 'step': 2701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.377256', 'step': 2701, 'epoch': 3} {'type': 'loss', 'content': 5.039268216933124e-05, 'timestamp': '2025-10-01 04:14:23.380200', 'step': 2702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:23.410924', 'step': 2702, 'epoch': 3} {'type': 'loss', 'content': 0.000560373708140105, 'timestamp': '2025-10-01 04:14:23.414103', 'step': 2703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.448321', 'step': 2703, 'epoch': 3} {'type': 'loss', 'content': 0.02016831561923027, 'timestamp': '2025-10-01 04:14:23.473051', 'step': 2704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.505733', 'step': 2704, 'epoch': 3} {'type': 'loss', 'content': 7.694788655498996e-05, 'timestamp': '2025-10-01 04:14:23.509563', 'step': 2705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.543125', 'step': 2705, 'epoch': 3} {'type': 'loss', 'content': 3.4393237001495436e-05, 'timestamp': '2025-10-01 04:14:23.546537', 'step': 2706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.580126', 'step': 2706, 'epoch': 3} {'type': 'loss', 'content': 0.00016384324408136308, 'timestamp': '2025-10-01 04:14:23.583589', 'step': 2707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.625307', 'step': 2707, 'epoch': 3} {'type': 'loss', 'content': 0.0013702316209673882, 'timestamp': '2025-10-01 04:14:23.649747', 'step': 2708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.683137', 'step': 2708, 'epoch': 3} {'type': 'loss', 'content': 0.0005768322153016925, 'timestamp': '2025-10-01 04:14:23.686257', 'step': 2709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.718163', 'step': 2709, 'epoch': 3} {'type': 'loss', 'content': 0.002737879054620862, 'timestamp': '2025-10-01 04:14:23.720987', 'step': 2710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:23.755528', 'step': 2710, 'epoch': 3} {'type': 'loss', 'content': 5.805503678857349e-05, 'timestamp': '2025-10-01 04:14:23.759385', 'step': 2711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.800299', 'step': 2711, 'epoch': 3} {'type': 'loss', 'content': 1.628149584576022e-05, 'timestamp': '2025-10-01 04:14:23.825425', 'step': 2712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.859178', 'step': 2712, 'epoch': 3} {'type': 'loss', 'content': 0.0020005072001367807, 'timestamp': '2025-10-01 04:14:23.862185', 'step': 2713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:23.894207', 'step': 2713, 'epoch': 3} {'type': 'loss', 'content': 0.09073884040117264, 'timestamp': '2025-10-01 04:14:23.897399', 'step': 2714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.929788', 'step': 2714, 'epoch': 3} {'type': 'loss', 'content': 0.04919225350022316, 'timestamp': '2025-10-01 04:14:23.933008', 'step': 2715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:23.965571', 'step': 2715, 'epoch': 3} {'type': 'loss', 'content': 0.01102256216108799, 'timestamp': '2025-10-01 04:14:23.990031', 'step': 2716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.023163', 'step': 2716, 'epoch': 3} {'type': 'loss', 'content': 0.014040348120033741, 'timestamp': '2025-10-01 04:14:24.026486', 'step': 2717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:24.062599', 'step': 2717, 'epoch': 3} {'type': 'loss', 'content': 2.2828298824606463e-05, 'timestamp': '2025-10-01 04:14:24.066467', 'step': 2718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.098871', 'step': 2718, 'epoch': 3} {'type': 'loss', 'content': 0.0004682219005189836, 'timestamp': '2025-10-01 04:14:24.101749', 'step': 2719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.134718', 'step': 2719, 'epoch': 3} {'type': 'loss', 'content': 0.021064382046461105, 'timestamp': '2025-10-01 04:14:24.160136', 'step': 2720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:24.192876', 'step': 2720, 'epoch': 3} {'type': 'loss', 'content': 0.0023182774893939495, 'timestamp': '2025-10-01 04:14:24.195912', 'step': 2721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.228385', 'step': 2721, 'epoch': 3} {'type': 'loss', 'content': 9.966571815311909e-05, 'timestamp': '2025-10-01 04:14:24.231945', 'step': 2722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.265017', 'step': 2722, 'epoch': 3} {'type': 'loss', 'content': 0.036073338240385056, 'timestamp': '2025-10-01 04:14:24.268566', 'step': 2723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.300716', 'step': 2723, 'epoch': 3} {'type': 'loss', 'content': 0.0005764077068306506, 'timestamp': '2025-10-01 04:14:24.324838', 'step': 2724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.358191', 'step': 2724, 'epoch': 3} {'type': 'loss', 'content': 0.004021498374640942, 'timestamp': '2025-10-01 04:14:24.361235', 'step': 2725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:24.395162', 'step': 2725, 'epoch': 3} {'type': 'loss', 'content': 0.0007272333605214953, 'timestamp': '2025-10-01 04:14:24.398432', 'step': 2726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:24.430728', 'step': 2726, 'epoch': 3} {'type': 'loss', 'content': 0.0004156237991992384, 'timestamp': '2025-10-01 04:14:24.433363', 'step': 2727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.465723', 'step': 2727, 'epoch': 3} {'type': 'loss', 'content': 0.004409321118146181, 'timestamp': '2025-10-01 04:14:24.491592', 'step': 2728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.525686', 'step': 2728, 'epoch': 3} {'type': 'loss', 'content': 0.00032090378226712346, 'timestamp': '2025-10-01 04:14:24.528148', 'step': 2729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.559962', 'step': 2729, 'epoch': 3} {'type': 'loss', 'content': 0.005059172865003347, 'timestamp': '2025-10-01 04:14:24.562831', 'step': 2730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.599559', 'step': 2730, 'epoch': 3} {'type': 'loss', 'content': 0.0022533093579113483, 'timestamp': '2025-10-01 04:14:24.602996', 'step': 2731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:24.637850', 'step': 2731, 'epoch': 3} {'type': 'loss', 'content': 0.0028393480461090803, 'timestamp': '2025-10-01 04:14:24.662388', 'step': 2732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:24.696061', 'step': 2732, 'epoch': 3} {'type': 'loss', 'content': 0.0021085678599774837, 'timestamp': '2025-10-01 04:14:24.699093', 'step': 2733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:24.732334', 'step': 2733, 'epoch': 3} {'type': 'loss', 'content': 0.004252140875905752, 'timestamp': '2025-10-01 04:14:24.735370', 'step': 2734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.768097', 'step': 2734, 'epoch': 3} {'type': 'loss', 'content': 0.013400801457464695, 'timestamp': '2025-10-01 04:14:24.771066', 'step': 2735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:24.805823', 'step': 2735, 'epoch': 3} {'type': 'loss', 'content': 0.018713532015681267, 'timestamp': '2025-10-01 04:14:24.830526', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:25.644065', 'step': 2736, 'epoch': 3} {'type': 'pplx', 'content': 68029215.63403933, 'timestamp': '2025-10-01 04:14:25.645970', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:25.674612', 'step': 2736, 'epoch': 3} {'type': 'loss', 'content': 0.0023472080938518047, 'timestamp': '2025-10-01 04:14:25.676892', 'step': 2737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:25.708452', 'step': 2737, 'epoch': 3} {'type': 'loss', 'content': 0.009190792217850685, 'timestamp': '2025-10-01 04:14:25.711120', 'step': 2738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:25.741240', 'step': 2738, 'epoch': 3} {'type': 'loss', 'content': 0.015830060467123985, 'timestamp': '2025-10-01 04:14:25.743599', 'step': 2739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:25.773643', 'step': 2739, 'epoch': 3} {'type': 'loss', 'content': 0.008671787567436695, 'timestamp': '2025-10-01 04:14:25.797839', 'step': 2740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:25.828459', 'step': 2740, 'epoch': 3} {'type': 'loss', 'content': 0.008564976043999195, 'timestamp': '2025-10-01 04:14:25.830663', 'step': 2741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:25.861727', 'step': 2741, 'epoch': 3} {'type': 'loss', 'content': 0.002389447530731559, 'timestamp': '2025-10-01 04:14:25.863992', 'step': 2742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:25.893764', 'step': 2742, 'epoch': 3} {'type': 'loss', 'content': 0.01760433055460453, 'timestamp': '2025-10-01 04:14:25.896777', 'step': 2743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:25.927483', 'step': 2743, 'epoch': 3} {'type': 'loss', 'content': 0.019632836803793907, 'timestamp': '2025-10-01 04:14:25.951270', 'step': 2744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:25.981671', 'step': 2744, 'epoch': 3} {'type': 'loss', 'content': 0.000977389863692224, 'timestamp': '2025-10-01 04:14:25.983860', 'step': 2745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:26.014891', 'step': 2745, 'epoch': 3} {'type': 'loss', 'content': 0.0004742510209325701, 'timestamp': '2025-10-01 04:14:26.017322', 'step': 2746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:26.049127', 'step': 2746, 'epoch': 3} {'type': 'loss', 'content': 0.0005396570777520537, 'timestamp': '2025-10-01 04:14:26.052329', 'step': 2747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-10-01 04:14:26.083573', 'step': 2747, 'epoch': 3} {'type': 'loss', 'content': 0.001477371552027762, 'timestamp': '2025-10-01 04:14:26.107906', 'step': 2748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:26.139308', 'step': 2748, 'epoch': 3} {'type': 'loss', 'content': 0.0005041286931373179, 'timestamp': '2025-10-01 04:14:26.141655', 'step': 2749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-10-01 04:14:26.172138', 'step': 2749, 'epoch': 3} {'type': 'loss', 'content': 0.0012628821423277259, 'timestamp': '2025-10-01 04:14:26.174280', 'step': 2750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-10-01 04:14:26.204653', 'step': 2750, 'epoch': 3} {'type': 'loss', 'content': 0.0002868495066650212, 'timestamp': '2025-10-01 04:14:26.207392', 'step': 2751, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-10-01 04:14:26.953400', 'step': 2751, 'epoch': 3} {'type': 'pplx', 'content': 38842487.79769442, 'timestamp': '2025-10-01 04:14:26.955402', 'step': 2751, 'epoch': 3} {'type': 'best_pplx', 'content': 38842487.79769442, 'timestamp': '2025-10-01 04:14:26.956761', 'step': 2751, 'epoch': 3} {'type': 'best_step', 'content': 2751, 'timestamp': '2025-10-01 04:14:26.958432', 'step': 2751, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 5014951860256000, 'timestamp': '2025-10-01 04:14:26.959981', 'step': 2751, 'epoch': 3} {'type': 'total_train_flops', 'content': 10640863719936576, 'timestamp': '2025-10-01 04:14:26.961873', 'step': 2751, 'epoch': 3}